## Ignore warnings
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import sklearn
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split #train_test_split
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import Lasso, Ridge #Ridge and Lasso Regression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error #Evaluation Metrics
# Importing RFE and LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
##Dataset file encoding check
import chardet
import os
# NOTE(review): path is hard-coded; the doubled ".csv.csv" extension appears to
# match the actual file name on disk — confirm before renaming.
my_file = os.path.join(r"C:\\Users\\Acer\\OneDrive\\Desktop\\New folder\\HousePrices.csv.csv")
# Use a context manager so the handle is closed (the original open(...) leaked it).
# The first line is enough for chardet to estimate the encoding.
with open(my_file, 'rb') as f:
    rawdata = f.readline()
my_encoding = chardet.detect(rawdata)
print("File encoding details are as follows:\n", my_encoding)
File encoding details are as follows:
{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}
##Read housing dataset as 'df'
# NOTE(review): doubled ".csv.csv" extension — presumably the real file name; verify.
file_path = r"C:\\Users\\Acer\\OneDrive\\Desktop\\New folder\\HousePrices.csv.csv"
df = pd.read_csv(file_path)
# Peek at the first five rows to confirm the load.
df.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2003 | 2003 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 196.0 | Gd | TA | PConc | Gd | TA | No | GLQ | 706 | Unf | 0 | 150 | 856 | GasA | Ex | Y | SBrkr | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | Typ | 0 | NaN | Attchd | 2003.0 | RFn | 2 | 548 | TA | TA | Y | 0 | 61 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | 1Fam | 1Story | 6 | 8 | 1976 | 1976 | Gable | CompShg | MetalSd | MetalSd | None | 0.0 | TA | TA | CBlock | Gd | TA | Gd | ALQ | 978 | Unf | 0 | 284 | 1262 | GasA | Ex | Y | SBrkr | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1976.0 | RFn | 2 | 460 | TA | TA | Y | 298 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2001 | 2002 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 162.0 | Gd | TA | PConc | Gd | TA | Mn | GLQ | 486 | Unf | 0 | 434 | 920 | GasA | Ex | Y | SBrkr | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | Typ | 1 | TA | Attchd | 2001.0 | RFn | 2 | 608 | TA | TA | Y | 0 | 42 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1915 | 1970 | Gable | CompShg | Wd Sdng | Wd Shng | None | 0.0 | TA | TA | BrkTil | TA | Gd | No | ALQ | 216 | Unf | 0 | 540 | 756 | GasA | Gd | Y | SBrkr | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Detchd | 1998.0 | Unf | 3 | 642 | TA | TA | Y | 0 | 35 | 272 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | Norm | 1Fam | 2Story | 8 | 5 | 2000 | 2000 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 350.0 | Gd | TA | PConc | Gd | TA | Av | GLQ | 655 | Unf | 0 | 490 | 1145 | GasA | Ex | Y | SBrkr | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | Typ | 1 | TA | Attchd | 2000.0 | RFn | 3 | 836 | TA | TA | Y | 192 | 84 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
##Check shape of the dataset
total_records= df.shape[0] #storing total no. of records in a variable
df.shape  # (rows, columns) — (1460, 81) per the recorded run
(1460, 81)
df.info()  # dtype and non-null count per column (43 object, 35 int64, 3 float64 per the run output)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1460 entries, 0 to 1459 Data columns (total 81 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Id 1460 non-null int64 1 MSSubClass 1460 non-null int64 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 LotArea 1460 non-null int64 5 Street 1460 non-null object 6 Alley 91 non-null object 7 LotShape 1460 non-null object 8 LandContour 1460 non-null object 9 Utilities 1460 non-null object 10 LotConfig 1460 non-null object 11 LandSlope 1460 non-null object 12 Neighborhood 1460 non-null object 13 Condition1 1460 non-null object 14 Condition2 1460 non-null object 15 BldgType 1460 non-null object 16 HouseStyle 1460 non-null object 17 OverallQual 1460 non-null int64 18 OverallCond 1460 non-null int64 19 YearBuilt 1460 non-null int64 20 YearRemodAdd 1460 non-null int64 21 RoofStyle 1460 non-null object 22 RoofMatl 1460 non-null object 23 Exterior1st 1460 non-null object 24 Exterior2nd 1460 non-null object 25 MasVnrType 1452 non-null object 26 MasVnrArea 1452 non-null float64 27 ExterQual 1460 non-null object 28 ExterCond 1460 non-null object 29 Foundation 1460 non-null object 30 BsmtQual 1423 non-null object 31 BsmtCond 1423 non-null object 32 BsmtExposure 1422 non-null object 33 BsmtFinType1 1423 non-null object 34 BsmtFinSF1 1460 non-null int64 35 BsmtFinType2 1422 non-null object 36 BsmtFinSF2 1460 non-null int64 37 BsmtUnfSF 1460 non-null int64 38 TotalBsmtSF 1460 non-null int64 39 Heating 1460 non-null object 40 HeatingQC 1460 non-null object 41 CentralAir 1460 non-null object 42 Electrical 1459 non-null object 43 1stFlrSF 1460 non-null int64 44 2ndFlrSF 1460 non-null int64 45 LowQualFinSF 1460 non-null int64 46 GrLivArea 1460 non-null int64 47 BsmtFullBath 1460 non-null int64 48 BsmtHalfBath 1460 non-null int64 49 FullBath 1460 non-null int64 50 HalfBath 1460 non-null int64 51 BedroomAbvGr 1460 non-null int64 52 KitchenAbvGr 1460 non-null int64 53 KitchenQual 1460 non-null 
object 54 TotRmsAbvGrd 1460 non-null int64 55 Functional 1460 non-null object 56 Fireplaces 1460 non-null int64 57 FireplaceQu 770 non-null object 58 GarageType 1379 non-null object 59 GarageYrBlt 1379 non-null float64 60 GarageFinish 1379 non-null object 61 GarageCars 1460 non-null int64 62 GarageArea 1460 non-null int64 63 GarageQual 1379 non-null object 64 GarageCond 1379 non-null object 65 PavedDrive 1460 non-null object 66 WoodDeckSF 1460 non-null int64 67 OpenPorchSF 1460 non-null int64 68 EnclosedPorch 1460 non-null int64 69 3SsnPorch 1460 non-null int64 70 ScreenPorch 1460 non-null int64 71 PoolArea 1460 non-null int64 72 PoolQC 7 non-null object 73 Fence 281 non-null object 74 MiscFeature 54 non-null object 75 MiscVal 1460 non-null int64 76 MoSold 1460 non-null int64 77 YrSold 1460 non-null int64 78 SaleType 1460 non-null object 79 SaleCondition 1460 non-null object 80 SalePrice 1460 non-null int64 dtypes: float64(3), int64(35), object(43) memory usage: 924.0+ KB
df.describe()  # summary statistics for the numeric columns only
| Id | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | TotRmsAbvGrd | Fireplaces | GarageYrBlt | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1460.000000 | 1460.000000 | 1201.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1452.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1379.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 |
| mean | 730.500000 | 56.897260 | 70.049958 | 10516.828082 | 6.099315 | 5.575342 | 1971.267808 | 1984.865753 | 103.685262 | 443.639726 | 46.549315 | 567.240411 | 1057.429452 | 1162.626712 | 346.992466 | 5.844521 | 1515.463699 | 0.425342 | 0.057534 | 1.565068 | 0.382877 | 2.866438 | 1.046575 | 6.517808 | 0.613014 | 1978.506164 | 1.767123 | 472.980137 | 94.244521 | 46.660274 | 21.954110 | 3.409589 | 15.060959 | 2.758904 | 43.489041 | 6.321918 | 2007.815753 | 180921.195890 |
| std | 421.610009 | 42.300571 | 24.284752 | 9981.264932 | 1.382997 | 1.112799 | 30.202904 | 20.645407 | 181.066207 | 456.098091 | 161.319273 | 441.866955 | 438.705324 | 386.587738 | 436.528436 | 48.623081 | 525.480383 | 0.518911 | 0.238753 | 0.550916 | 0.502885 | 0.815778 | 0.220338 | 1.625393 | 0.644666 | 24.689725 | 0.747315 | 213.804841 | 125.338794 | 66.256028 | 61.119149 | 29.317331 | 55.757415 | 40.177307 | 496.123024 | 2.703626 | 1.328095 | 79442.502883 |
| min | 1.000000 | 20.000000 | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 1900.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 34900.000000 |
| 25% | 365.750000 | 20.000000 | 59.000000 | 7553.500000 | 5.000000 | 5.000000 | 1954.000000 | 1967.000000 | 0.000000 | 0.000000 | 0.000000 | 223.000000 | 795.750000 | 882.000000 | 0.000000 | 0.000000 | 1129.500000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 5.000000 | 0.000000 | 1961.000000 | 1.000000 | 334.500000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 5.000000 | 2007.000000 | 129975.000000 |
| 50% | 730.500000 | 50.000000 | 69.000000 | 9478.500000 | 6.000000 | 5.000000 | 1973.000000 | 1994.000000 | 0.000000 | 383.500000 | 0.000000 | 477.500000 | 991.500000 | 1087.000000 | 0.000000 | 0.000000 | 1464.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 3.000000 | 1.000000 | 6.000000 | 1.000000 | 1980.000000 | 2.000000 | 480.000000 | 0.000000 | 25.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 | 163000.000000 |
| 75% | 1095.250000 | 70.000000 | 80.000000 | 11601.500000 | 7.000000 | 6.000000 | 2000.000000 | 2004.000000 | 166.000000 | 712.250000 | 0.000000 | 808.000000 | 1298.250000 | 1391.250000 | 728.000000 | 0.000000 | 1776.750000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 7.000000 | 1.000000 | 2002.000000 | 2.000000 | 576.000000 | 168.000000 | 68.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 | 214000.000000 |
| max | 1460.000000 | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | 5644.000000 | 1474.000000 | 2336.000000 | 6110.000000 | 4692.000000 | 2065.000000 | 572.000000 | 5642.000000 | 3.000000 | 2.000000 | 3.000000 | 2.000000 | 8.000000 | 3.000000 | 14.000000 | 3.000000 | 2010.000000 | 4.000000 | 1418.000000 | 857.000000 | 547.000000 | 552.000000 | 508.000000 | 480.000000 | 738.000000 | 15500.000000 | 12.000000 | 2010.000000 | 755000.000000 |
##Stripping off white_spaces from the column name(if any)
df = df.rename(columns=str.strip)
##Checking and dropping duplicated records(if any)
df = df.drop_duplicates()
##Check null values if any(column wise: missing value percentage)
# isnull().mean()*100 == 100*isnull().sum()/len(index); keep only columns with nulls.
null_pct = (df.isnull().mean() * 100).round(2)
missing = pd.DataFrame(null_pct[null_pct != 0])
missing
| 0 | |
|---|---|
| LotFrontage | 17.74 |
| Alley | 93.77 |
| MasVnrType | 0.55 |
| MasVnrArea | 0.55 |
| BsmtQual | 2.53 |
| BsmtCond | 2.53 |
| BsmtExposure | 2.60 |
| BsmtFinType1 | 2.53 |
| BsmtFinType2 | 2.60 |
| Electrical | 0.07 |
| FireplaceQu | 47.26 |
| GarageType | 5.55 |
| GarageYrBlt | 5.55 |
| GarageFinish | 5.55 |
| GarageQual | 5.55 |
| GarageCond | 5.55 |
| PoolQC | 99.52 |
| Fence | 80.75 |
| MiscFeature | 96.30 |
#missing values dataframe (sorting values for graph)
# Column 0 of `missing` holds the null percentages; sort descending for the barplot.
missing_df = (
    pd.DataFrame({'features': missing.index, 'percent': missing[0]})
    .sort_values(by='percent', ascending=False)
)
##'Null Values Percentages' in the housing dataset, df: Barplot
sns.set(style='white')
plt.figure(figsize=(10,8), dpi=120)
ax_x= sns.barplot(x=missing_df.features, y=missing_df.percent)
# Label each bar with its exact percentage, offset 10 points above the bar top.
for p in ax_x.patches:
    ax_x.annotate(format(p.get_height(), '.2f')+"%", (p.get_x() + p.get_width() / 2.\
    , p.get_height()), ha = 'center'\
    , va = 'center', xytext = (0, 10), textcoords = 'offset points', fontsize =10, rotation=30)
plt.xticks(rotation=90, fontsize=10)
plt.xlabel('Features', fontsize= 14, fontstyle='italic')
plt.ylabel('(%)Percentage of Null' , fontsize= 14, fontstyle='italic')
plt.title('Null Value Percentages', fontsize=18,fontweight='bold')
plt.grid(True)
plt.tight_layout()
plt.autoscale()
plt.show()
##Inspecting null values in the categorical columns (only object types).
# Restrict to object-dtype columns and report their non-zero null percentages.
cat_obj_null_cols = df.select_dtypes(include='object')
missing_cat_obj_null_prcnt = (cat_obj_null_cols.isnull().mean() * 100).round(2)
missing_cat_obj_null_prcnt = missing_cat_obj_null_prcnt[missing_cat_obj_null_prcnt != 0]
missing_cat_obj_null_prcnt
Alley 93.77 MasVnrType 0.55 BsmtQual 2.53 BsmtCond 2.53 BsmtExposure 2.60 BsmtFinType1 2.53 BsmtFinType2 2.60 Electrical 0.07 FireplaceQu 47.26 GarageType 5.55 GarageFinish 5.55 GarageQual 5.55 GarageCond 5.55 PoolQC 99.52 Fence 80.75 MiscFeature 96.30 dtype: float64
##Considering the aspect of 'meaningful missing'. Replace nulls with 'None' where null implies absence of feature (from data dictionary)
cat_null_obj_cols = ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu',
                     'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'Electrical']
# Single vectorized fillna over the whole column list instead of a per-column loop.
df[cat_null_obj_cols] = df[cat_null_obj_cols].fillna('None')
##Inspecting null values in the categorical columns (only object types).
# Re-run the object-column null check; expected to be empty after the fills above.
cat_obj_null_cols = df.select_dtypes(include='object')
missing_cat_obj_null_prcnt = (cat_obj_null_cols.isnull().mean() * 100).round(2)
missing_cat_obj_null_prcnt = missing_cat_obj_null_prcnt[missing_cat_obj_null_prcnt != 0]
missing_cat_obj_null_prcnt
Series([], dtype: float64)
##Removing categorical features that have more than 80% data associated to one single value.
def get_cols_imbal(data, prcnt):
    """Return the object-dtype columns dominated by a single value.

    A column qualifies when its most frequent value occurs in more than
    `prcnt` percent of the rows.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to scan (read-only; the original's defensive data.copy() was
        unnecessary and is removed).
    prcnt : numeric
        Percentage threshold, e.g. 80.

    Returns
    -------
    list of str
        Column names, in frame column order.
    """
    # Hoist the invariant threshold out of the per-column check.
    threshold = int(prcnt * len(data.index) / 100)
    obj_cols = data.select_dtypes(include=['object'])
    return [col for col in obj_cols if obj_cols[col].value_counts().max() > threshold]
##Removing the skewed data
# Object-type columns where one value covers >80% of rows carry little signal.
columns_to_be_removed = get_cols_imbal(data=df, prcnt=80)
print("Categorical object type columns removed:", columns_to_be_removed)
print("\nNumber of categorical object type columns removed: ", len(columns_to_be_removed), "\n")
##Dropping columns with skewed data (object type categorical variables)
df = df.drop(columns=columns_to_be_removed)
df.head()
Categorical object type columns removed: ['Street', 'Alley', 'LandContour', 'Utilities', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'RoofMatl', 'ExterCond', 'BsmtCond', 'BsmtFinType2', 'Heating', 'CentralAir', 'Electrical', 'Functional', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'] Number of categorical object type columns removed: 24
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | LotShape | LotConfig | Neighborhood | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | Foundation | BsmtQual | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | HeatingQC | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Reg | Inside | CollgCr | 2Story | 7 | 5 | 2003 | 2003 | Gable | VinylSd | VinylSd | BrkFace | 196.0 | Gd | PConc | Gd | No | GLQ | 706 | 0 | 150 | 856 | Ex | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | 0 | None | Attchd | 2003.0 | RFn | 2 | 548 | 0 | 61 | 0 | 0 | 0 | 0 | 0 | 2 | 2008 | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Reg | FR2 | Veenker | 1Story | 6 | 8 | 1976 | 1976 | Gable | MetalSd | MetalSd | None | 0.0 | TA | CBlock | Gd | Gd | ALQ | 978 | 0 | 284 | 1262 | Ex | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | 1 | TA | Attchd | 1976.0 | RFn | 2 | 460 | 298 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 2007 | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | IR1 | Inside | CollgCr | 2Story | 7 | 5 | 2001 | 2002 | Gable | VinylSd | VinylSd | BrkFace | 162.0 | Gd | PConc | Gd | Mn | GLQ | 486 | 0 | 434 | 920 | Ex | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | 1 | TA | Attchd | 2001.0 | RFn | 2 | 608 | 0 | 42 | 0 | 0 | 0 | 0 | 0 | 9 | 2008 | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | IR1 | Corner | Crawfor | 2Story | 7 | 5 | 1915 | 1970 | Gable | Wd Sdng | Wd Shng | None | 0.0 | TA | BrkTil | TA | No | ALQ | 216 | 0 | 540 | 756 | Gd | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | 1 | Gd | Detchd | 1998.0 | Unf | 3 | 642 | 0 | 35 | 272 | 0 | 0 | 0 | 0 | 2 | 2006 | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | IR1 | FR2 | NoRidge | 2Story | 8 | 5 | 2000 | 2000 | Gable | VinylSd | VinylSd | BrkFace | 350.0 | Gd | PConc | Gd | Av | GLQ | 655 | 0 | 490 | 1145 | Ex | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | 1 | TA | Attchd | 2000.0 | RFn | 3 | 836 | 192 | 84 | 0 | 0 | 0 | 0 | 0 | 12 | 2008 | 250000 |
df.shape  # (1460, 57) after dropping the 24 skewed object columns
(1460, 57)
##Recheck null values in the categorical columns (only object types).
# Sanity re-check after the column drops; expected empty.
cat_obj_null_cols = df.select_dtypes(include='object')
missing_cat_obj_null_prcnt = (cat_obj_null_cols.isnull().mean() * 100).round(2)
missing_cat_obj_null_prcnt = missing_cat_obj_null_prcnt[missing_cat_obj_null_prcnt != 0]
missing_cat_obj_null_prcnt
Series([], dtype: float64)
##Creating a dataframe 'df_obj_cat' with object type categorical variables
# Keep only the object-dtype columns for the frequency analysis below.
obj_type_cat = df.select_dtypes(include='object').columns
df_obj_cat = df[obj_type_cat]
df_obj_cat.head()
| MSZoning | LotShape | LotConfig | Neighborhood | HouseStyle | RoofStyle | Exterior1st | Exterior2nd | MasVnrType | ExterQual | Foundation | BsmtQual | BsmtExposure | BsmtFinType1 | HeatingQC | KitchenQual | FireplaceQu | GarageType | GarageFinish | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | RL | Reg | Inside | CollgCr | 2Story | Gable | VinylSd | VinylSd | BrkFace | Gd | PConc | Gd | No | GLQ | Ex | Gd | None | Attchd | RFn |
| 1 | RL | Reg | FR2 | Veenker | 1Story | Gable | MetalSd | MetalSd | None | TA | CBlock | Gd | Gd | ALQ | Ex | TA | TA | Attchd | RFn |
| 2 | RL | IR1 | Inside | CollgCr | 2Story | Gable | VinylSd | VinylSd | BrkFace | Gd | PConc | Gd | Mn | GLQ | Ex | Gd | TA | Attchd | RFn |
| 3 | RL | IR1 | Corner | Crawfor | 2Story | Gable | Wd Sdng | Wd Shng | None | TA | BrkTil | TA | No | ALQ | Gd | Gd | Gd | Detchd | Unf |
| 4 | RL | IR1 | FR2 | NoRidge | 2Story | Gable | VinylSd | VinylSd | BrkFace | Gd | PConc | Gd | Av | GLQ | Ex | Gd | TA | Attchd | RFn |
df_obj_cat.columns  # the 19 remaining object-type categorical features
Index(['MSZoning', 'LotShape', 'LotConfig', 'Neighborhood', 'HouseStyle',
'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
'Foundation', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'HeatingQC',
'KitchenQual', 'FireplaceQu', 'GarageType', 'GarageFinish'],
dtype='object')
def categorical_freq_prcnt_plt(data, f1, f2, f3):
    """Draw frequency barplots (% of rows per category) for 2 or 3 features.

    Parameters
    ----------
    data : pd.DataFrame
        Source frame.
    f1, f2 : str
        Column names to plot.
    f3 : str or 0
        Third column name, or 0 to plot only f1 and f2 side by side.

    The original repeated the same ~15 plotting lines once per subplot;
    this version factors that into a single helper, preserving behavior.
    """
    def _freq_subplot(frame, feature, position):
        # One subplot: per-category frequency as a rounded percentage barplot.
        plt.subplot(position)
        freq = pd.DataFrame((frame[feature].value_counts(normalize=True) * 100).round(0).sort_values())
        freq.reset_index(inplace=True)
        ax = sns.barplot(x='index', y=feature, data=freq, palette='Set1')
        plt.setp(ax.get_xticklabels(), rotation=90, horizontalalignment='right')
        plt.xlabel(feature, fontsize=14, fontstyle='italic')
        plt.ylabel('Frequency(%count of total)', fontsize=14, fontstyle='italic')
        plt.title(feature + ' Analysis', fontsize=16, fontweight='bold')
        plt.grid(True)
        # Label each bar with its percentage, nudged 10 points above the bar top.
        for bar in ax.patches:
            ax.annotate(format(bar.get_height()) + "%",
                        (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                        ha='center', va='center', xytext=(0, 10),
                        textcoords='offset points', rotation=45, fontsize=11)

    features = (f1, f2) if f3 == 0 else (f1, f2, f3)
    base = 120 if len(features) == 2 else 130  # 1x2 vs 1x3 subplot grid
    frame = data.copy()
    sns.set(style='white')
    plt.figure(figsize=(14, 8), dpi=100)
    for offset, feature in enumerate(features, start=1):
        _freq_subplot(frame, feature, base + offset)
    plt.tight_layout()
    plt.autoscale()
    plt.show()
# Frequency barplots for each retained categorical feature; a final arg of 0
# means only two subplots are drawn for that call.
categorical_freq_prcnt_plt(df,'MSZoning','LotShape','LotConfig')
categorical_freq_prcnt_plt(df,'KitchenQual', 'FireplaceQu','RoofStyle')
categorical_freq_prcnt_plt(df,'GarageType', 'GarageFinish', 'MasVnrType')
categorical_freq_prcnt_plt(df,'ExterQual','Foundation', 'BsmtQual')
categorical_freq_prcnt_plt(df,'BsmtExposure', 'BsmtFinType1', 'HeatingQC')
categorical_freq_prcnt_plt(df, 'Neighborhood', 'HouseStyle',0)
categorical_freq_prcnt_plt(df,'Exterior1st', 'Exterior2nd', 0)
##Inspecting null values in the numeric columns.
# Restrict to int/float dtypes and report their non-zero null percentages.
numeric_null_cols = df.select_dtypes(include=['int64','float64','int32','float32'])
missing_numeric_null_prcnt = (numeric_null_cols.isnull().mean() * 100).round(2)
missing_numeric_null_prcnt = missing_numeric_null_prcnt[missing_numeric_null_prcnt != 0]
missing_numeric_null_prcnt
LotFrontage 17.74 MasVnrArea 0.55 GarageYrBlt 5.55 dtype: float64
# Percentile spread for the two float columns with missing values (pre-imputation).
print("LotFrontage Stats:\n",df.LotFrontage.describe(percentiles=[0.1, .25, .50, .75, .85, .90, .95, .98, .99, 1]))
print("\nMasVnrArea Stats:\n",df.MasVnrArea.describe(percentiles=[0.1, .25, .50, .75, .85, .90, .95, .98, .99, 1]))
LotFrontage Stats: count 1201.000000 mean 70.049958 std 24.284752 min 21.000000 10% 44.000000 25% 59.000000 50% 69.000000 75% 80.000000 85% 90.000000 90% 96.000000 95% 107.000000 98% 124.000000 99% 141.000000 100% 313.000000 max 313.000000 Name: LotFrontage, dtype: float64 MasVnrArea Stats: count 1452.000000 mean 103.685262 std 181.066207 min 0.000000 10% 0.000000 25% 0.000000 50% 0.000000 75% 166.000000 85% 262.350000 90% 335.000000 95% 456.000000 98% 650.980000 99% 791.920000 100% 1600.000000 max 1600.000000 Name: MasVnrArea, dtype: float64
##For 'MasVnrArea', finding the median by removing all the values at 0.0 percentile to get a more precise value.
# Keep non-zero areas (zero presumably means no masonry veneer — per data dictionary).
updated_MasVnrArea = df.MasVnrArea[df.MasVnrArea != 0]
print("\nMasVnrArea Stats:\n",updated_MasVnrArea.describe(percentiles=[0.1, .25, .50, .75, .85, .90, .95, .98, .99, 1]))
MasVnrArea Stats: count 591.000000 mean 254.739425 std 205.144174 min 1.000000 10% 70.000000 25% 113.000000 50% 203.000000 75% 330.500000 85% 424.500000 90% 481.000000 95% 650.500000 98% 862.000000 99% 1032.600000 100% 1600.000000 max 1600.000000 Name: MasVnrArea, dtype: float64
##Impute the null values with median values for 'LotFrontage' and 'MasVnrArea' columns
# fillna() is the idiomatic, intent-revealing equivalent of replace(np.nan, ...).
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].median())
# MasVnrArea median is taken over non-zero values only (computed above).
df['MasVnrArea'] = df['MasVnrArea'].fillna(updated_MasVnrArea.median())
##Filling the null values with 0 for 'GarageYrBlt' for now as we would be handling this column further below
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(0).astype(int)
##Recheck null values in the numeric columns.
# Post-imputation sanity check; expected empty.
numeric_null_cols = df.select_dtypes(include=['int64','float64','int32','float32'])
missing_numeric_null_prcnt = (numeric_null_cols.isnull().mean() * 100).round(2)
missing_numeric_null_prcnt = missing_numeric_null_prcnt[missing_numeric_null_prcnt != 0]
missing_numeric_null_prcnt
Series([], dtype: float64)
##Create a new column named 'WhetherRemodelled': this column determines whether the house has been remodelled, based on 'YearBuilt' and 'YearRemodAdd'
def Remodel_Check(row):
    """Classify a row's remodelling status from its build/remodel years.

    Returns 0 when never remodelled (years equal), 1 when remodelled
    (remodel year after build year), 2 for any anomalous record
    (remodel year before build year).
    """
    built = row['YearBuilt']
    remodelled = row['YearRemodAdd']
    if built == remodelled:
        return 0
    return 1 if built < remodelled else 2
df['WhetherRemodelled'] = df.apply(Remodel_Check, axis=1)  # row-wise flag: 0/1/2
df.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | LotShape | LotConfig | Neighborhood | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | Foundation | BsmtQual | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | HeatingQC | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | WhetherRemodelled | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Reg | Inside | CollgCr | 2Story | 7 | 5 | 2003 | 2003 | Gable | VinylSd | VinylSd | BrkFace | 196.0 | Gd | PConc | Gd | No | GLQ | 706 | 0 | 150 | 856 | Ex | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | 0 | None | Attchd | 2003 | RFn | 2 | 548 | 0 | 61 | 0 | 0 | 0 | 0 | 0 | 2 | 2008 | 208500 | 0 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Reg | FR2 | Veenker | 1Story | 6 | 8 | 1976 | 1976 | Gable | MetalSd | MetalSd | None | 0.0 | TA | CBlock | Gd | Gd | ALQ | 978 | 0 | 284 | 1262 | Ex | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | 1 | TA | Attchd | 1976 | RFn | 2 | 460 | 298 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 2007 | 181500 | 0 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | IR1 | Inside | CollgCr | 2Story | 7 | 5 | 2001 | 2002 | Gable | VinylSd | VinylSd | BrkFace | 162.0 | Gd | PConc | Gd | Mn | GLQ | 486 | 0 | 434 | 920 | Ex | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | 1 | TA | Attchd | 2001 | RFn | 2 | 608 | 0 | 42 | 0 | 0 | 0 | 0 | 0 | 9 | 2008 | 223500 | 1 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | IR1 | Corner | Crawfor | 2Story | 7 | 5 | 1915 | 1970 | Gable | Wd Sdng | Wd Shng | None | 0.0 | TA | BrkTil | TA | No | ALQ | 216 | 0 | 540 | 756 | Gd | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | 1 | Gd | Detchd | 1998 | Unf | 3 | 642 | 0 | 35 | 272 | 0 | 0 | 0 | 0 | 2 | 2006 | 140000 | 1 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | IR1 | FR2 | NoRidge | 2Story | 8 | 5 | 2000 | 2000 | Gable | VinylSd | VinylSd | BrkFace | 350.0 | Gd | PConc | Gd | Av | GLQ | 655 | 0 | 490 | 1145 | Ex | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | 1 | TA | Attchd | 2000 | RFn | 3 | 836 | 192 | 84 | 0 | 0 | 0 | 0 | 0 | 12 | 2008 | 250000 | 0 |
##Confirmed: no wrong entries with value '2'. Hence, 0 if not remodelled and 1 if remodelled.
df.WhetherRemodelled.value_counts()  # run output shows only 0 (764) and 1 (696)
0 764 1 696 Name: WhetherRemodelled, dtype: int64
##Derive a new variable named 'AgeofProperty' to determine the age of property since the built
def find_age_since_built(row):
    """Return the property's age in years (YrSold - YearBuilt) at sale time."""
    sold_year = row['YrSold']
    built_year = row['YearBuilt']
    return sold_year - built_year
df['AgeofProperty'] = df.apply(find_age_since_built, axis=1)
df.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | LotShape | LotConfig | Neighborhood | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | Foundation | BsmtQual | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | HeatingQC | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | WhetherRemodelled | AgeofProperty | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Reg | Inside | CollgCr | 2Story | 7 | 5 | 2003 | 2003 | Gable | VinylSd | VinylSd | BrkFace | 196.0 | Gd | PConc | Gd | No | GLQ | 706 | 0 | 150 | 856 | Ex | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | 0 | None | Attchd | 2003 | RFn | 2 | 548 | 0 | 61 | 0 | 0 | 0 | 0 | 0 | 2 | 2008 | 208500 | 0 | 5 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Reg | FR2 | Veenker | 1Story | 6 | 8 | 1976 | 1976 | Gable | MetalSd | MetalSd | None | 0.0 | TA | CBlock | Gd | Gd | ALQ | 978 | 0 | 284 | 1262 | Ex | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | 1 | TA | Attchd | 1976 | RFn | 2 | 460 | 298 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 2007 | 181500 | 0 | 31 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | IR1 | Inside | CollgCr | 2Story | 7 | 5 | 2001 | 2002 | Gable | VinylSd | VinylSd | BrkFace | 162.0 | Gd | PConc | Gd | Mn | GLQ | 486 | 0 | 434 | 920 | Ex | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | 1 | TA | Attchd | 2001 | RFn | 2 | 608 | 0 | 42 | 0 | 0 | 0 | 0 | 0 | 9 | 2008 | 223500 | 1 | 7 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | IR1 | Corner | Crawfor | 2Story | 7 | 5 | 1915 | 1970 | Gable | Wd Sdng | Wd Shng | None | 0.0 | TA | BrkTil | TA | No | ALQ | 216 | 0 | 540 | 756 | Gd | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | 1 | Gd | Detchd | 1998 | Unf | 3 | 642 | 0 | 35 | 272 | 0 | 0 | 0 | 0 | 2 | 2006 | 140000 | 1 | 91 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | IR1 | FR2 | NoRidge | 2Story | 8 | 5 | 2000 | 2000 | Gable | VinylSd | VinylSd | BrkFace | 350.0 | Gd | PConc | Gd | Av | GLQ | 655 | 0 | 490 | 1145 | Ex | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | 1 | TA | Attchd | 2000 | RFn | 3 | 836 | 192 | 84 | 0 | 0 | 0 | 0 | 0 | 12 | 2008 | 250000 | 0 | 8 |
##Dropping the raw attributes from which the new features were derived. These aren't necessary for analysis
##Dropping 'YrSold', 'GarageYrBlt', 'YearRemodAdd' and 'YearBuilt'
# drop(columns=...) is equivalent to drop(..., axis=1)
df = df.drop(columns=['YearBuilt', 'YearRemodAdd', 'YrSold', 'GarageYrBlt'])
df.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | LotShape | LotConfig | Neighborhood | HouseStyle | OverallQual | OverallCond | RoofStyle | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | Foundation | BsmtQual | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | HeatingQC | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Fireplaces | FireplaceQu | GarageType | GarageFinish | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | SalePrice | WhetherRemodelled | AgeofProperty | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Reg | Inside | CollgCr | 2Story | 7 | 5 | Gable | VinylSd | VinylSd | BrkFace | 196.0 | Gd | PConc | Gd | No | GLQ | 706 | 0 | 150 | 856 | Ex | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | 0 | None | Attchd | RFn | 2 | 548 | 0 | 61 | 0 | 0 | 0 | 0 | 0 | 2 | 208500 | 0 | 5 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Reg | FR2 | Veenker | 1Story | 6 | 8 | Gable | MetalSd | MetalSd | None | 0.0 | TA | CBlock | Gd | Gd | ALQ | 978 | 0 | 284 | 1262 | Ex | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | 1 | TA | Attchd | RFn | 2 | 460 | 298 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 181500 | 0 | 31 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | IR1 | Inside | CollgCr | 2Story | 7 | 5 | Gable | VinylSd | VinylSd | BrkFace | 162.0 | Gd | PConc | Gd | Mn | GLQ | 486 | 0 | 434 | 920 | Ex | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | 1 | TA | Attchd | RFn | 2 | 608 | 0 | 42 | 0 | 0 | 0 | 0 | 0 | 9 | 223500 | 1 | 7 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | IR1 | Corner | Crawfor | 2Story | 7 | 5 | Gable | Wd Sdng | Wd Shng | None | 0.0 | TA | BrkTil | TA | No | ALQ | 216 | 0 | 540 | 756 | Gd | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | 1 | Gd | Detchd | Unf | 3 | 642 | 0 | 35 | 272 | 0 | 0 | 0 | 0 | 2 | 140000 | 1 | 91 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | IR1 | FR2 | NoRidge | 2Story | 8 | 5 | Gable | VinylSd | VinylSd | BrkFace | 350.0 | Gd | PConc | Gd | Av | GLQ | 655 | 0 | 490 | 1145 | Ex | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | 1 | TA | Attchd | RFn | 3 | 836 | 192 | 84 | 0 | 0 | 0 | 0 | 0 | 12 | 250000 | 0 | 8 |
# Sanity-check dimensions after dropping the four raw year columns (expect 55 cols)
df.shape
(1460, 55)
##Removing numeric attributes that have more than 80% data associated to one single value.
def get_num_cols_imbal(data, prcnt):
    """Return the numeric columns of `data` whose single most frequent value
    accounts for more than `prcnt` percent of the rows.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to scan (only int/float dtypes are considered).
    prcnt : float
        Threshold percentage, e.g. 80.
    """
    # No defensive copy needed: select_dtypes/value_counts never mutate `data`.
    num_cols = data.select_dtypes(include=['int64', 'float64', 'int32', 'float32'])
    # Hoist the loop-invariant threshold out of the per-column loop.
    threshold = int(prcnt * len(data.index) / 100)
    return [col for col in num_cols
            if data[col].value_counts().max() > threshold]
##Removing the skewed data
num_cols_to_be_removed = get_num_cols_imbal(data=df, prcnt=80)
print("Numeric columns removed:", num_cols_to_be_removed)
print("\nNumber of numeric columns removed: ", len(num_cols_to_be_removed), "\n")
##Dropping columns with skewed data (int and float type numeric variables)
df = df.drop(columns=num_cols_to_be_removed)
df.head()
Numeric columns removed: ['BsmtFinSF2', 'LowQualFinSF', 'BsmtHalfBath', 'KitchenAbvGr', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal'] Number of numeric columns removed: 9
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | LotShape | LotConfig | Neighborhood | HouseStyle | OverallQual | OverallCond | RoofStyle | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | Foundation | BsmtQual | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtUnfSF | TotalBsmtSF | HeatingQC | 1stFlrSF | 2ndFlrSF | GrLivArea | BsmtFullBath | FullBath | HalfBath | BedroomAbvGr | KitchenQual | TotRmsAbvGrd | Fireplaces | FireplaceQu | GarageType | GarageFinish | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | MoSold | SalePrice | WhetherRemodelled | AgeofProperty | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Reg | Inside | CollgCr | 2Story | 7 | 5 | Gable | VinylSd | VinylSd | BrkFace | 196.0 | Gd | PConc | Gd | No | GLQ | 706 | 150 | 856 | Ex | 856 | 854 | 1710 | 1 | 2 | 1 | 3 | Gd | 8 | 0 | None | Attchd | RFn | 2 | 548 | 0 | 61 | 2 | 208500 | 0 | 5 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Reg | FR2 | Veenker | 1Story | 6 | 8 | Gable | MetalSd | MetalSd | None | 0.0 | TA | CBlock | Gd | Gd | ALQ | 978 | 284 | 1262 | Ex | 1262 | 0 | 1262 | 0 | 2 | 0 | 3 | TA | 6 | 1 | TA | Attchd | RFn | 2 | 460 | 298 | 0 | 5 | 181500 | 0 | 31 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | IR1 | Inside | CollgCr | 2Story | 7 | 5 | Gable | VinylSd | VinylSd | BrkFace | 162.0 | Gd | PConc | Gd | Mn | GLQ | 486 | 434 | 920 | Ex | 920 | 866 | 1786 | 1 | 2 | 1 | 3 | Gd | 6 | 1 | TA | Attchd | RFn | 2 | 608 | 0 | 42 | 9 | 223500 | 1 | 7 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | IR1 | Corner | Crawfor | 2Story | 7 | 5 | Gable | Wd Sdng | Wd Shng | None | 0.0 | TA | BrkTil | TA | No | ALQ | 216 | 540 | 756 | Gd | 961 | 756 | 1717 | 1 | 1 | 0 | 3 | Gd | 7 | 1 | Gd | Detchd | Unf | 3 | 642 | 0 | 35 | 2 | 140000 | 1 | 91 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | IR1 | FR2 | NoRidge | 2Story | 8 | 5 | Gable | VinylSd | VinylSd | BrkFace | 350.0 | Gd | PConc | Gd | Av | GLQ | 655 | 490 | 1145 | Ex | 1145 | 1053 | 2198 | 1 | 2 | 1 | 4 | Gd | 9 | 1 | TA | Attchd | RFn | 3 | 836 | 192 | 84 | 12 | 250000 | 0 | 8 |
##Recheck the percentage of null values across the entire dataframe
missing_val_cols_prcnt = round(100*df.isnull().sum()/len(df.index), 2)
# Keep only the columns that still have missing values (expect an empty Series here)
missing = missing_val_cols_prcnt.loc[lambda s: s != 0]
missing
Series([], dtype: float64)
# Dimensions after removing the 9 imbalanced numeric columns (expect 46 cols)
df.shape
(1460, 46)
##Numeric categorical variables
numeric_cat_cols = ['MSSubClass','MoSold','OverallQual','OverallCond','BsmtFullBath','FullBath','HalfBath','BedroomAbvGr',\
'TotRmsAbvGrd','Fireplaces','GarageCars','WhetherRemodelled']
# Plot frequency/percentage charts for the numeric-categorical columns.
# NOTE(review): categorical_freq_prcnt_plt is defined earlier in the notebook
# (not visible in this chunk); the trailing 0 arguments presumably mean
# "no third column to plot" -- confirm against its definition.
categorical_freq_prcnt_plt(df, 'MoSold','MSSubClass',0)
categorical_freq_prcnt_plt(df, 'OverallQual','OverallCond','BsmtFullBath')
categorical_freq_prcnt_plt(df, 'FullBath','HalfBath','BedroomAbvGr')
categorical_freq_prcnt_plt(df, 'TotRmsAbvGrd','Fireplaces',0)
categorical_freq_prcnt_plt(df, 'GarageCars','WhetherRemodelled',0)
##Removing 'Id' and 'MoSold' columns as they are trivial for our analysis
df = df.drop(columns=['Id', 'MoSold'])
##Check outliers
# Extended upper percentiles expose heavy right tails
# (e.g. LotArea max 215245 vs its 98th percentile ~25252)
df.describe(percentiles=[0.1,0.25,0.5,0.75,0.95,0.98,0.99,1])
| MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | MasVnrArea | BsmtFinSF1 | BsmtUnfSF | TotalBsmtSF | 1stFlrSF | 2ndFlrSF | GrLivArea | BsmtFullBath | FullBath | HalfBath | BedroomAbvGr | TotRmsAbvGrd | Fireplaces | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | SalePrice | WhetherRemodelled | AgeofProperty | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 |
| mean | 56.897260 | 69.863699 | 10516.828082 | 6.099315 | 5.575342 | 104.229452 | 443.639726 | 567.240411 | 1057.429452 | 1162.626712 | 346.992466 | 1515.463699 | 0.425342 | 1.565068 | 0.382877 | 2.866438 | 6.517808 | 0.613014 | 1.767123 | 472.980137 | 94.244521 | 46.660274 | 180921.195890 | 0.476712 | 36.547945 |
| std | 42.300571 | 22.027677 | 9981.264932 | 1.382997 | 1.112799 | 180.717988 | 456.098091 | 441.866955 | 438.705324 | 386.587738 | 436.528436 | 525.480383 | 0.518911 | 0.550916 | 0.502885 | 0.815778 | 1.625393 | 0.644666 | 0.747315 | 213.804841 | 125.338794 | 66.256028 | 79442.502883 | 0.499629 | 30.250152 |
| min | 20.000000 | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 34900.000000 | 0.000000 | 0.000000 |
| 10% | 20.000000 | 49.000000 | 5000.000000 | 5.000000 | 5.000000 | 0.000000 | 0.000000 | 74.900000 | 636.900000 | 756.900000 | 0.000000 | 912.000000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 5.000000 | 0.000000 | 1.000000 | 240.000000 | 0.000000 | 0.000000 | 106475.000000 | 0.000000 | 1.000000 |
| 25% | 20.000000 | 60.000000 | 7553.500000 | 5.000000 | 5.000000 | 0.000000 | 0.000000 | 223.000000 | 795.750000 | 882.000000 | 0.000000 | 1129.500000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 5.000000 | 0.000000 | 1.000000 | 334.500000 | 0.000000 | 0.000000 | 129975.000000 | 0.000000 | 8.000000 |
| 50% | 50.000000 | 69.000000 | 9478.500000 | 6.000000 | 5.000000 | 0.000000 | 383.500000 | 477.500000 | 991.500000 | 1087.000000 | 0.000000 | 1464.000000 | 0.000000 | 2.000000 | 0.000000 | 3.000000 | 6.000000 | 1.000000 | 2.000000 | 480.000000 | 0.000000 | 25.000000 | 163000.000000 | 0.000000 | 35.000000 |
| 75% | 70.000000 | 79.000000 | 11601.500000 | 7.000000 | 6.000000 | 168.000000 | 712.250000 | 808.000000 | 1298.250000 | 1391.250000 | 728.000000 | 1776.750000 | 1.000000 | 2.000000 | 1.000000 | 3.000000 | 7.000000 | 1.000000 | 2.000000 | 576.000000 | 168.000000 | 68.000000 | 214000.000000 | 1.000000 | 54.000000 |
| 95% | 160.000000 | 104.000000 | 17401.150000 | 8.000000 | 8.000000 | 456.000000 | 1274.000000 | 1468.000000 | 1753.000000 | 1831.250000 | 1141.050000 | 2466.100000 | 1.000000 | 2.000000 | 1.000000 | 4.000000 | 10.000000 | 2.000000 | 3.000000 | 850.100000 | 335.000000 | 175.050000 | 326100.000000 | 1.000000 | 91.000000 |
| 98% | 188.200000 | 120.820000 | 25251.620000 | 9.000000 | 8.000000 | 650.820000 | 1442.640000 | 1678.200000 | 2001.640000 | 2072.280000 | 1318.560000 | 2782.380000 | 1.000000 | 3.000000 | 1.000000 | 4.000000 | 10.820000 | 2.000000 | 3.000000 | 907.460000 | 430.100000 | 240.820000 | 394931.060000 | 1.000000 | 100.000000 |
| 99% | 190.000000 | 137.410000 | 37567.640000 | 10.000000 | 9.000000 | 791.280000 | 1572.410000 | 1797.050000 | 2155.050000 | 2219.460000 | 1418.920000 | 3123.480000 | 2.000000 | 3.000000 | 1.000000 | 5.000000 | 11.000000 | 2.000000 | 3.000000 | 1002.790000 | 505.460000 | 285.820000 | 442567.010000 | 1.000000 | 110.410000 |
| 100% | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 1600.000000 | 5644.000000 | 2336.000000 | 6110.000000 | 4692.000000 | 2065.000000 | 5642.000000 | 3.000000 | 3.000000 | 2.000000 | 8.000000 | 14.000000 | 3.000000 | 4.000000 | 1418.000000 | 857.000000 | 547.000000 | 755000.000000 | 1.000000 | 136.000000 |
| max | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 1600.000000 | 5644.000000 | 2336.000000 | 6110.000000 | 4692.000000 | 2065.000000 | 5642.000000 | 3.000000 | 3.000000 | 2.000000 | 8.000000 | 14.000000 | 3.000000 | 4.000000 | 1418.000000 | 857.000000 | 547.000000 | 755000.000000 | 1.000000 | 136.000000 |
# Check the outliers in all the numeric columns (boxplot)
plt.figure(figsize=(20, 20))
# Same six columns, same subplot positions as before -- just driven by a loop.
box_cols = ['LotArea', 'TotalBsmtSF', 'MasVnrArea', 'OpenPorchSF', 'WoodDeckSF', 'GrLivArea']
for pos, col in enumerate(box_cols, start=1):
    plt.subplot(5, 3, pos)
    sns.boxplot(y=col, color='b', data=df)
plt.show()
df.shape
(1460, 44)
##Removing the Outliers
# Trim each feature at its upper quantile; rows with values >= the cutoff are
# dropped (the comparison is strict '<', exactly as before).
outlier_caps = [('LotArea', 0.98), ('TotalBsmtSF', 0.99),
                ('WoodDeckSF', 0.99), ('OpenPorchSF', 0.99)]
for col, q in outlier_caps:
    cutoff = df[col].quantile(q)
    df = df[df[col] < cutoff]
# Determine the percentage of data retained
retained_data = round(100*(len(df)/total_records), 2)
print(retained_data, "%")
94.93 %
##Visualising the target variable 'SalePrice'
# scipy.stats supplies the normal fit and the QQ-plot helper
from scipy import stats
from scipy.stats import norm
plt.figure(figsize=(6,8), dpi=100)
plt.subplot(211)
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (histplot/displot
# replace it); left unchanged since it works on the version used here.
sns.distplot(df['SalePrice'], fit=norm)
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(df['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
#Plotting the distribution
# NOTE(review): the label string uses non-raw backslash escapes; newer Pythons
# emit a SyntaxWarning -- consider a raw string (left unchanged here).
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
loc='best')
plt.ylabel('Frequency', fontsize=10)
plt.title('SalePrice Distribution', fontsize=14, fontweight='bold')
#Get also the QQ-plot to check SalePrice against theoretical normal quantiles
plt.subplot(212)
res = stats.probplot(df['SalePrice'], plot=plt)
plt.autoscale()
plt.tight_layout()
plt.show()
mu = 176342.76 and sigma = 71616.92
##Transforming the target variable 'SalePrice' using Log-Transform
##to reduce the right skew seen in the previous plot.
from scipy import stats
from scipy.stats import norm
##Using the numpy function log1p, which applies log(1+x) to every element of the target column
df["SalePrice"] = np.log1p(df["SalePrice"])
plt.figure(figsize=(6,8), dpi=100)
plt.subplot(211)
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; left as-is.
sns.distplot(df['SalePrice'], fit=norm)
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(df['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
#Plotting the distribution
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
loc='best')
plt.ylabel('Frequency', fontsize=10)
plt.title("SalePrice Distribution (After Log-Transformation)", fontsize=14, fontweight='bold')
#Get also the QQ-plot to confirm the transform improved normality
plt.subplot(212)
res = stats.probplot(df['SalePrice'], plot=plt)
plt.autoscale()
plt.tight_layout()
plt.show()
mu = 12.01 and sigma = 0.38
##Let's define a function to draw scatterplots to understand relationship between 'dependent' and 'numeric' variables.
#(For three scatterplots together)
def scatter_plt_3(a, b, c):
    """Draw SalePrice-vs-feature scatterplots for three columns side by side."""
    plt.figure(figsize=(15,5), dpi=150)
    sns.set_style("white")
    # One subplot per column, positions 1..3 as before
    for idx, col in enumerate((a, b, c), start=1):
        plt.subplot(1, 3, idx)
        plt.scatter(x=df[col], y=df['SalePrice'])
        plt.title(("Sale price vs "+col), fontweight='bold', fontsize=15)
        plt.xlabel(col, fontsize=13, fontstyle='italic')
        plt.ylabel('SalePrice', fontsize=13, fontstyle='italic')
        plt.grid(True)
    plt.autoscale()
    plt.tight_layout()
    plt.show()
##(For two scatterplots together)
def scatter_plt_2(a, b):
    """Draw SalePrice-vs-feature scatterplots for two columns side by side."""
    sns.set_style("white")
    plt.figure(figsize=(10,5), dpi=150)
    # NOTE: the two title prefixes deliberately reproduce the original
    # capitalisation difference ("Sale Price" vs "Sale price").
    titles = ("Sale Price vs ", "Sale price vs ")
    for idx, (col, title) in enumerate(zip((a, b), titles), start=1):
        plt.subplot(1, 2, idx)
        plt.scatter(x=df[col], y=df['SalePrice'])
        plt.title((title + col), fontweight='bold', fontsize=15)
        plt.xlabel(col, fontsize=13, fontstyle='italic')
        plt.ylabel('SalePrice', fontsize=13, fontstyle='italic')
        plt.grid(True)
    plt.autoscale()
    plt.tight_layout()
    plt.show()
# Inspect each numeric predictor against SalePrice (same groupings as before)
for triple in [('LotFrontage', 'LotArea', 'TotalBsmtSF'),
               ('MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF'),
               ('1stFlrSF', '2ndFlrSF', 'GrLivArea')]:
    scatter_plt_3(*triple)
for pair in [('GarageArea', 'WoodDeckSF'), ('OpenPorchSF', 'AgeofProperty')]:
    scatter_plt_2(*pair)
##Based on the 'LotFrontage', 'MasVnrArea', '1stFlrSF', and 'GarageArea' scatterplots w.r.t. SalePrice, eliminating some outlier datapoints
# The four sequential filters are independent row conditions, so a single
# combined mask removes exactly the same rows.
outlier_mask = (
    ((df['SalePrice'] < 11.0) & (df['LotFrontage'] > 150))
    | ((df['SalePrice'] > 12.0) & (df['MasVnrArea'] > 1500))
    | ((df['SalePrice'] > 12.0) & (df['1stFlrSF'] > 2500))
    | ((df['SalePrice'] < 12.5) & (df['GarageArea'] > 1200))
)
df = df.loc[~outlier_mask]
## Determine the percentage of data retained in the dataset
new_retained_data = round(100*(len(df)/total_records), 2)
print(new_retained_data, "%")
94.59 %
#Plotting a heatmap to understand correlation amongst numeric variables (ignore the categorical numeric ones)
numeric_vars = df.select_dtypes(include=['int32','float32','int64','float64']).columns
corr_matrix_new = df[numeric_vars].corr()
# np.bool was deprecated in NumPy 1.20 and removed in 1.24 -- the builtin bool is equivalent.
my_mask_1 = np.triu(np.ones_like(corr_matrix_new, dtype=bool))
# Create the figure only via plt.subplots: the previous extra plt.figure(...) call
# produced the stray empty "<Figure ... with 0 Axes>" artifact in the output.
f, ax_corr1 = plt.subplots(figsize=(15, 15), dpi=100)
# Mask the upper triangle so each correlation is shown once
ax_corr1 = sns.heatmap(corr_matrix_new, cmap= 'YlGnBu', cbar_kws={"shrink": .5}, vmin= -1, vmax=1, center=0,
square=True, mask=my_mask_1, annot=True)
plt.xticks(fontsize=12, rotation=90)
plt.yticks(fontsize=12, rotation=0)
plt.tight_layout()
plt.autoscale()
plt.show()
<Figure size 1440x1200 with 0 Axes>
##Removing the highly correlated predictors to reduce multicollinearity.
df = df.drop(columns=['TotRmsAbvGrd', 'GarageCars', '2ndFlrSF', '1stFlrSF'])
df.head()
| MSSubClass | MSZoning | LotFrontage | LotArea | LotShape | LotConfig | Neighborhood | HouseStyle | OverallQual | OverallCond | RoofStyle | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | Foundation | BsmtQual | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtUnfSF | TotalBsmtSF | HeatingQC | GrLivArea | BsmtFullBath | FullBath | HalfBath | BedroomAbvGr | KitchenQual | Fireplaces | FireplaceQu | GarageType | GarageFinish | GarageArea | WoodDeckSF | OpenPorchSF | SalePrice | WhetherRemodelled | AgeofProperty | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 60 | RL | 65.0 | 8450 | Reg | Inside | CollgCr | 2Story | 7 | 5 | Gable | VinylSd | VinylSd | BrkFace | 196.0 | Gd | PConc | Gd | No | GLQ | 706 | 150 | 856 | Ex | 1710 | 1 | 2 | 1 | 3 | Gd | 0 | None | Attchd | RFn | 548 | 0 | 61 | 12.247699 | 0 | 5 |
| 1 | 20 | RL | 80.0 | 9600 | Reg | FR2 | Veenker | 1Story | 6 | 8 | Gable | MetalSd | MetalSd | None | 0.0 | TA | CBlock | Gd | Gd | ALQ | 978 | 284 | 1262 | Ex | 1262 | 0 | 2 | 0 | 3 | TA | 1 | TA | Attchd | RFn | 460 | 298 | 0 | 12.109016 | 0 | 31 |
| 2 | 60 | RL | 68.0 | 11250 | IR1 | Inside | CollgCr | 2Story | 7 | 5 | Gable | VinylSd | VinylSd | BrkFace | 162.0 | Gd | PConc | Gd | Mn | GLQ | 486 | 434 | 920 | Ex | 1786 | 1 | 2 | 1 | 3 | Gd | 1 | TA | Attchd | RFn | 608 | 0 | 42 | 12.317171 | 1 | 7 |
| 3 | 70 | RL | 60.0 | 9550 | IR1 | Corner | Crawfor | 2Story | 7 | 5 | Gable | Wd Sdng | Wd Shng | None | 0.0 | TA | BrkTil | TA | No | ALQ | 216 | 540 | 756 | Gd | 1717 | 1 | 1 | 0 | 3 | Gd | 1 | Gd | Detchd | Unf | 642 | 0 | 35 | 11.849405 | 1 | 91 |
| 4 | 60 | RL | 84.0 | 14260 | IR1 | FR2 | NoRidge | 2Story | 8 | 5 | Gable | VinylSd | VinylSd | BrkFace | 350.0 | Gd | PConc | Gd | Av | GLQ | 655 | 490 | 1145 | Ex | 2198 | 1 | 2 | 1 | 4 | Gd | 1 | TA | Attchd | RFn | 836 | 192 | 84 | 12.429220 | 0 | 8 |
#Rechecking the heatmap to understand correlation amongst all numeric variables (ignore the categorical numeric ones)
numeric_vars1 = df.select_dtypes(include=['int32','float32','int64','float64']).columns
corr_matrix_new1 = df[numeric_vars1].corr()
# np.bool was deprecated in NumPy 1.20 and removed in 1.24 -- the builtin bool is equivalent.
my_mask_2 = np.triu(np.ones_like(corr_matrix_new1, dtype=bool))
# Create the figure only via plt.subplots: the previous extra plt.figure(...) call
# produced the stray empty "<Figure ... with 0 Axes>" artifact in the output.
f, ax_corr2 = plt.subplots(figsize=(15, 15), dpi=100)
# Mask the upper triangle so each correlation is shown once
ax_corr2 = sns.heatmap(corr_matrix_new1, cmap= 'YlGnBu', cbar_kws={"shrink": .5}, vmin= -1, vmax=1, center=0,
square=True, mask=my_mask_2, annot=True)
plt.xticks(fontsize=12, rotation=90)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.autoscale()
plt.show()
<Figure size 1440x1200 with 0 Axes>
#Housing Dataframe Shape Check
# 1381 rows remain after outlier removal; 40 columns after dropping correlated predictors
df.shape
(1381, 40)
# Since the values of the following fields are ordered list, we shall assign values to them in sequence
# For values which can be ordered, we have given an ordered sequence value
# For values which cannot be ordered, we have categorised them into 0 and 1
##For ordinals, assign each category its weight (higher = better); 'None' means the feature is absent.
ordinal_maps = {
    'LotShape': {'Reg': 3, 'IR1': 2, 'IR2': 1, 'IR3': 0},  # 3 indicates better i.e. Regular LotShape
    'ExterQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0},
    'BsmtQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0},
    'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0},
    'BsmtFinType1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0},
    'HeatingQC': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0},
    'KitchenQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0},
    'FireplaceQu': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0},
    'GarageFinish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0},
}
# Create the encoded 'd_<col>' columns in the same order as before, then drop the raw ones.
for col, mapping in ordinal_maps.items():
    df['d_' + col] = df[col].map(mapping)
df = df.drop(list(ordinal_maps.keys()), axis=1)
df.head()
| MSSubClass | MSZoning | LotFrontage | LotArea | LotConfig | Neighborhood | HouseStyle | OverallQual | OverallCond | RoofStyle | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | Foundation | BsmtFinSF1 | BsmtUnfSF | TotalBsmtSF | GrLivArea | BsmtFullBath | FullBath | HalfBath | BedroomAbvGr | Fireplaces | GarageType | GarageArea | WoodDeckSF | OpenPorchSF | SalePrice | WhetherRemodelled | AgeofProperty | d_LotShape | d_ExterQual | d_BsmtQual | d_BsmtExposure | d_BsmtFinType1 | d_HeatingQC | d_KitchenQual | d_FireplaceQu | d_GarageFinish | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 60 | RL | 65.0 | 8450 | Inside | CollgCr | 2Story | 7 | 5 | Gable | VinylSd | VinylSd | BrkFace | 196.0 | PConc | 706 | 150 | 856 | 1710 | 1 | 2 | 1 | 3 | 0 | Attchd | 548 | 0 | 61 | 12.247699 | 0 | 5 | 3 | 4 | 4 | 1 | 6 | 5 | 4 | 0 | 2 |
| 1 | 20 | RL | 80.0 | 9600 | FR2 | Veenker | 1Story | 6 | 8 | Gable | MetalSd | MetalSd | None | 0.0 | CBlock | 978 | 284 | 1262 | 1262 | 0 | 2 | 0 | 3 | 1 | Attchd | 460 | 298 | 0 | 12.109016 | 0 | 31 | 3 | 3 | 4 | 4 | 5 | 5 | 3 | 3 | 2 |
| 2 | 60 | RL | 68.0 | 11250 | Inside | CollgCr | 2Story | 7 | 5 | Gable | VinylSd | VinylSd | BrkFace | 162.0 | PConc | 486 | 434 | 920 | 1786 | 1 | 2 | 1 | 3 | 1 | Attchd | 608 | 0 | 42 | 12.317171 | 1 | 7 | 2 | 4 | 4 | 2 | 6 | 5 | 4 | 3 | 2 |
| 3 | 70 | RL | 60.0 | 9550 | Corner | Crawfor | 2Story | 7 | 5 | Gable | Wd Sdng | Wd Shng | None | 0.0 | BrkTil | 216 | 540 | 756 | 1717 | 1 | 1 | 0 | 3 | 1 | Detchd | 642 | 0 | 35 | 11.849405 | 1 | 91 | 2 | 3 | 3 | 1 | 5 | 4 | 4 | 4 | 1 |
| 4 | 60 | RL | 84.0 | 14260 | FR2 | NoRidge | 2Story | 8 | 5 | Gable | VinylSd | VinylSd | BrkFace | 350.0 | PConc | 655 | 490 | 1145 | 2198 | 1 | 2 | 1 | 4 | 1 | Attchd | 836 | 192 | 84 | 12.429220 | 0 | 8 | 2 | 4 | 4 | 3 | 6 | 5 | 4 | 3 | 2 |
#Creating dummies for nominal variables
##One-hot encode each nominal column (drop_first=True avoids the dummy-variable trap).
##NOTE(review): the 'MasVnrTyp' prefix (missing the final 'e') is preserved on purpose --
##downstream code refers to columns such as 'MasVnrTyp_None' and 'MasVnrTyp_Stone'.
nominal_prefixes = [
    ('MSSubClass', 'MSSubClass'),
    ('LotConfig', 'LotConfig'),
    ('MSZoning', 'MSZoning'),
    ('Neighborhood', 'Neighborhood'),
    ('Exterior2nd', 'Exterior2nd'),
    ('HouseStyle', 'HouseStyle'),
    ('Foundation', 'Foundation'),
    ('MasVnrType', 'MasVnrTyp'),
    ('RoofStyle', 'RoofStyle'),
    ('Exterior1st', 'Exterior1st'),
    ('GarageType', 'GarageType'),
]
# Concatenate the dummies in the same column order the original one-by-one code produced.
for col, prefix in nominal_prefixes:
    dummies = pd.get_dummies(df[col], prefix=prefix, drop_first=True)
    df = pd.concat([df, dummies], axis=1)
##Dropping columns for which we have created dummies, as we now have new columns derived from these columns
df = df.drop([col for col, _ in nominal_prefixes], axis=1)
df.head()
| LotFrontage | LotArea | OverallQual | OverallCond | MasVnrArea | BsmtFinSF1 | BsmtUnfSF | TotalBsmtSF | GrLivArea | BsmtFullBath | FullBath | HalfBath | BedroomAbvGr | Fireplaces | GarageArea | WoodDeckSF | OpenPorchSF | SalePrice | WhetherRemodelled | AgeofProperty | d_LotShape | d_ExterQual | d_BsmtQual | d_BsmtExposure | d_BsmtFinType1 | d_HeatingQC | d_KitchenQual | d_FireplaceQu | d_GarageFinish | MSSubClass_30 | MSSubClass_40 | MSSubClass_45 | MSSubClass_50 | MSSubClass_60 | MSSubClass_70 | MSSubClass_75 | MSSubClass_80 | MSSubClass_85 | MSSubClass_90 | MSSubClass_120 | MSSubClass_160 | MSSubClass_180 | MSSubClass_190 | LotConfig_CulDSac | LotConfig_FR2 | LotConfig_FR3 | LotConfig_Inside | MSZoning_FV | MSZoning_RH | MSZoning_RL | MSZoning_RM | Neighborhood_Blueste | Neighborhood_BrDale | Neighborhood_BrkSide | Neighborhood_ClearCr | Neighborhood_CollgCr | Neighborhood_Crawfor | Neighborhood_Edwards | Neighborhood_Gilbert | Neighborhood_IDOTRR | Neighborhood_MeadowV | Neighborhood_Mitchel | Neighborhood_NAmes | Neighborhood_NPkVill | Neighborhood_NWAmes | Neighborhood_NoRidge | Neighborhood_NridgHt | Neighborhood_OldTown | Neighborhood_SWISU | Neighborhood_Sawyer | Neighborhood_SawyerW | Neighborhood_Somerst | Neighborhood_StoneBr | Neighborhood_Timber | Neighborhood_Veenker | Exterior2nd_AsphShn | Exterior2nd_Brk Cmn | Exterior2nd_BrkFace | Exterior2nd_CBlock | Exterior2nd_CmentBd | Exterior2nd_HdBoard | Exterior2nd_ImStucc | Exterior2nd_MetalSd | Exterior2nd_Other | Exterior2nd_Plywood | Exterior2nd_Stone | Exterior2nd_Stucco | Exterior2nd_VinylSd | Exterior2nd_Wd Sdng | Exterior2nd_Wd Shng | HouseStyle_1.5Unf | HouseStyle_1Story | HouseStyle_2.5Fin | HouseStyle_2.5Unf | HouseStyle_2Story | HouseStyle_SFoyer | HouseStyle_SLvl | Foundation_CBlock | Foundation_PConc | Foundation_Slab | Foundation_Stone | Foundation_Wood | MasVnrTyp_BrkFace | MasVnrTyp_None | MasVnrTyp_Stone | RoofStyle_Gable | RoofStyle_Gambrel | RoofStyle_Hip | RoofStyle_Mansard | 
RoofStyle_Shed | Exterior1st_AsphShn | Exterior1st_BrkComm | Exterior1st_BrkFace | Exterior1st_CBlock | Exterior1st_CemntBd | Exterior1st_HdBoard | Exterior1st_ImStucc | Exterior1st_MetalSd | Exterior1st_Plywood | Exterior1st_Stone | Exterior1st_Stucco | Exterior1st_VinylSd | Exterior1st_Wd Sdng | Exterior1st_WdShing | GarageType_Attchd | GarageType_Basment | GarageType_BuiltIn | GarageType_CarPort | GarageType_Detchd | GarageType_None | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 65.0 | 8450 | 7 | 5 | 196.0 | 706 | 150 | 856 | 1710 | 1 | 2 | 1 | 3 | 0 | 548 | 0 | 61 | 12.247699 | 0 | 5 | 3 | 4 | 4 | 1 | 6 | 5 | 4 | 0 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 1 | 80.0 | 9600 | 6 | 8 | 0.0 | 978 | 284 | 1262 | 1262 | 0 | 2 | 0 | 3 | 1 | 460 | 298 | 0 | 12.109016 | 0 | 31 | 3 | 3 | 4 | 4 | 5 | 5 | 3 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 2 | 68.0 | 11250 | 7 | 5 | 162.0 | 486 | 434 | 920 | 1786 | 1 | 2 | 1 | 3 | 1 | 608 | 0 | 42 | 12.317171 | 1 | 7 | 2 | 4 | 4 | 2 | 6 | 5 | 4 | 3 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 3 | 60.0 | 9550 | 7 | 5 | 0.0 | 216 | 540 | 756 | 1717 | 1 | 1 | 0 | 3 | 1 | 642 | 0 | 35 | 11.849405 | 1 | 91 | 2 | 3 | 3 | 1 | 5 | 4 | 4 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4 | 84.0 | 14260 | 8 | 5 | 350.0 | 655 | 490 | 1145 | 2198 | 1 | 2 | 1 | 4 | 1 | 836 | 192 | 84 | 12.429220 | 0 | 8 | 2 | 4 | 4 | 3 | 6 | 5 | 4 | 3 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
##Rechecking and eliminating those predictor variables that have a correlation of (threshold:-0.7 to 0.7) with other predictors.
df_corr = df.drop(['SalePrice'],axis =1)
my_corr_matrix = df_corr.corr().abs()
my_corr_matrix
# np.bool was deprecated in NumPy 1.20 and removed in 1.24 -- the builtin bool is equivalent here.
# k=1 keeps only the strict upper triangle, so each pair is inspected once.
upper_triangle = my_corr_matrix.where(np.triu(np.ones(my_corr_matrix.shape), k=1).astype(bool))
col_to_drop = [col for col in upper_triangle.columns if any(upper_triangle[col] > 0.70)] #Reduces multicollinearity (if any)
col_to_drop #columns to drop from df. Independent predictors highly correlated to each other
['d_ExterQual', 'd_BsmtFinType1', 'd_KitchenQual', 'd_FireplaceQu', 'MSZoning_RM', 'Neighborhood_Somerst', 'HouseStyle_1.5Unf', 'HouseStyle_2Story', 'HouseStyle_SFoyer', 'HouseStyle_SLvl', 'Foundation_PConc', 'MasVnrTyp_None', 'RoofStyle_Hip', 'Exterior1st_CBlock', 'Exterior1st_CemntBd', 'Exterior1st_HdBoard', 'Exterior1st_MetalSd', 'Exterior1st_Plywood', 'Exterior1st_Stucco', 'Exterior1st_VinylSd', 'Exterior1st_Wd Sdng', 'GarageType_Detchd']
##Dropping independent predictors highly correlated to each other
df = df.drop(columns=col_to_drop)
#Recheck Model
df.shape
(1381, 108)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1381 entries, 0 to 1458 Columns: 108 entries, LotFrontage to GarageType_None dtypes: float64(3), int64(22), uint8(83) memory usage: 392.5 KB
##Creating a copy of this dataframe for 'Question 3'
# Deep copy: later mutations of df will not affect df_new1
df_new1 = df.copy()
##Creating a function to find binary value columns from the 'df' dataframe (if any)
def binary_val_cols(df):
    """Return the names of columns in *df* that take exactly two distinct values.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    list of str
        Column labels with nunique() == 2 (typically 0/1 dummy columns).
    """
    # nunique() does not mutate the frame, so the original df.copy() was
    # unnecessary; the `== True` comparison is replaced by direct boolean masking.
    mask = df.nunique() == 2
    return list(df.columns[mask])
# Identify the dummy/indicator columns so they can be excluded from scaling later
binary_cols = binary_val_cols(df)
print(binary_cols)
['WhetherRemodelled', 'MSSubClass_30', 'MSSubClass_40', 'MSSubClass_45', 'MSSubClass_50', 'MSSubClass_60', 'MSSubClass_70', 'MSSubClass_75', 'MSSubClass_80', 'MSSubClass_85', 'MSSubClass_90', 'MSSubClass_120', 'MSSubClass_160', 'MSSubClass_180', 'MSSubClass_190', 'LotConfig_CulDSac', 'LotConfig_FR2', 'LotConfig_FR3', 'LotConfig_Inside', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'Neighborhood_Blueste', 'Neighborhood_BrDale', 'Neighborhood_BrkSide', 'Neighborhood_ClearCr', 'Neighborhood_CollgCr', 'Neighborhood_Crawfor', 'Neighborhood_Edwards', 'Neighborhood_Gilbert', 'Neighborhood_IDOTRR', 'Neighborhood_MeadowV', 'Neighborhood_Mitchel', 'Neighborhood_NAmes', 'Neighborhood_NPkVill', 'Neighborhood_NWAmes', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_OldTown', 'Neighborhood_SWISU', 'Neighborhood_Sawyer', 'Neighborhood_SawyerW', 'Neighborhood_StoneBr', 'Neighborhood_Timber', 'Neighborhood_Veenker', 'Exterior2nd_AsphShn', 'Exterior2nd_Brk Cmn', 'Exterior2nd_BrkFace', 'Exterior2nd_CBlock', 'Exterior2nd_CmentBd', 'Exterior2nd_HdBoard', 'Exterior2nd_ImStucc', 'Exterior2nd_MetalSd', 'Exterior2nd_Other', 'Exterior2nd_Plywood', 'Exterior2nd_Stone', 'Exterior2nd_Stucco', 'Exterior2nd_VinylSd', 'Exterior2nd_Wd Sdng', 'Exterior2nd_Wd Shng', 'HouseStyle_1Story', 'HouseStyle_2.5Fin', 'HouseStyle_2.5Unf', 'Foundation_CBlock', 'Foundation_Slab', 'Foundation_Stone', 'Foundation_Wood', 'MasVnrTyp_BrkFace', 'MasVnrTyp_Stone', 'RoofStyle_Gable', 'RoofStyle_Gambrel', 'RoofStyle_Mansard', 'RoofStyle_Shed', 'Exterior1st_AsphShn', 'Exterior1st_BrkComm', 'Exterior1st_BrkFace', 'Exterior1st_ImStucc', 'Exterior1st_Stone', 'Exterior1st_WdShing', 'GarageType_Attchd', 'GarageType_Basment', 'GarageType_BuiltIn', 'GarageType_CarPort', 'GarageType_None']
##split into train and test (70/30)
# FIX: removed the duplicate `from sklearn.model_selection import train_test_split`
# -- it is already imported at the top of the file.
# np.random.seed is kept for notebook-wide reproducibility, although
# random_state=100 alone pins this particular split.
np.random.seed(0)
df_train, df_test = train_test_split(df, train_size=0.7, test_size=0.3, random_state=100)
## Set aside the dummy (0/1) columns; they are concatenated back after scaling
df_binary_train = df_train[binary_cols]
df_binary_test = df_test[binary_cols]
## Remove them from the frames that will be standard-scaled, to preserve their 0/1 scale
df_train = df_train.drop(columns=binary_cols)
df_test = df_test.drop(columns=binary_cols)
##StandardScaler
from sklearn.preprocessing import StandardScaler
all_cols = df_train.columns
scaler = StandardScaler()
#scaler fit_transform on train data
# Fit on the training split ONLY so no test-set statistics leak into the scaling.
# NOTE(review): 'SalePrice' is still among all_cols here, so the target is
# standardized too -- later metrics are therefore on the scaled target; confirm intended.
df_train[all_cols] = scaler.fit_transform(df_train[all_cols])
#concat dummies:Train set
df_train = pd.concat([df_train, df_binary_train], axis=1)
#scaler transform on test data
# Reuse the train-fitted scaler (transform only, no re-fit) on the test split
df_test[all_cols] = scaler.transform(df_test[all_cols])
#concat dummies: Test set
df_test = pd.concat([df_test, df_binary_test], axis=1)
print(df_train.shape)
print(df_test.shape)
(966, 108) (415, 108)
## Target vector and feature matrix for each split
y_train = df_train['SalePrice']
y_test = df_test['SalePrice']
X_train = df_train.drop(columns=['SalePrice'])
X_test = df_test.drop(columns=['SalePrice'])
##Running RFE to select the top 50 features with a LinearRegression estimator
lm = LinearRegression()
lm.fit(X_train, y_train)
# FIX: n_features_to_select must be passed by keyword -- the positional form
# was deprecated in scikit-learn 0.24 and removed in 1.0.
rfe = RFE(lm, n_features_to_select=50)  # running RFE
rfe = rfe.fit(X_train, y_train)
## Pair each feature with its RFE support flag and ranking
my_zip = list(zip(X_train.columns, rfe.support_, rfe.ranking_))
my_zip
[('LotFrontage', False, 8),
('LotArea', False, 11),
('OverallQual', True, 1),
('OverallCond', True, 1),
('MasVnrArea', False, 49),
('BsmtFinSF1', True, 1),
('BsmtUnfSF', False, 41),
('TotalBsmtSF', True, 1),
('GrLivArea', True, 1),
('BsmtFullBath', False, 23),
('FullBath', False, 40),
('HalfBath', False, 36),
('BedroomAbvGr', False, 44),
('Fireplaces', False, 27),
('GarageArea', True, 1),
('WoodDeckSF', False, 24),
('OpenPorchSF', False, 48),
('AgeofProperty', True, 1),
('d_LotShape', False, 51),
('d_BsmtQual', False, 18),
('d_BsmtExposure', False, 31),
('d_HeatingQC', False, 28),
('d_GarageFinish', False, 53),
('WhetherRemodelled', False, 52),
('MSSubClass_30', False, 4),
('MSSubClass_40', False, 47),
('MSSubClass_45', True, 1),
('MSSubClass_50', False, 30),
('MSSubClass_60', False, 54),
('MSSubClass_70', True, 1),
('MSSubClass_75', True, 1),
('MSSubClass_80', False, 34),
('MSSubClass_85', False, 38),
('MSSubClass_90', True, 1),
('MSSubClass_120', False, 21),
('MSSubClass_160', True, 1),
('MSSubClass_180', False, 16),
('MSSubClass_190', False, 6),
('LotConfig_CulDSac', True, 1),
('LotConfig_FR2', False, 25),
('LotConfig_FR3', False, 22),
('LotConfig_Inside', False, 26),
('MSZoning_FV', True, 1),
('MSZoning_RH', False, 29),
('MSZoning_RL', True, 1),
('Neighborhood_Blueste', False, 17),
('Neighborhood_BrDale', True, 1),
('Neighborhood_BrkSide', False, 45),
('Neighborhood_ClearCr', False, 12),
('Neighborhood_CollgCr', False, 3),
('Neighborhood_Crawfor', True, 1),
('Neighborhood_Edwards', True, 1),
('Neighborhood_Gilbert', False, 14),
('Neighborhood_IDOTRR', True, 1),
('Neighborhood_MeadowV', True, 1),
('Neighborhood_Mitchel', True, 1),
('Neighborhood_NAmes', True, 1),
('Neighborhood_NPkVill', True, 1),
('Neighborhood_NWAmes', True, 1),
('Neighborhood_NoRidge', False, 13),
('Neighborhood_NridgHt', True, 1),
('Neighborhood_OldTown', True, 1),
('Neighborhood_SWISU', False, 20),
('Neighborhood_Sawyer', True, 1),
('Neighborhood_SawyerW', False, 2),
('Neighborhood_StoneBr', True, 1),
('Neighborhood_Timber', False, 15),
('Neighborhood_Veenker', False, 32),
('Exterior2nd_AsphShn', True, 1),
('Exterior2nd_Brk Cmn', True, 1),
('Exterior2nd_BrkFace', True, 1),
('Exterior2nd_CBlock', False, 43),
('Exterior2nd_CmentBd', True, 1),
('Exterior2nd_HdBoard', True, 1),
('Exterior2nd_ImStucc', True, 1),
('Exterior2nd_MetalSd', True, 1),
('Exterior2nd_Other', False, 55),
('Exterior2nd_Plywood', True, 1),
('Exterior2nd_Stone', True, 1),
('Exterior2nd_Stucco', True, 1),
('Exterior2nd_VinylSd', True, 1),
('Exterior2nd_Wd Sdng', True, 1),
('Exterior2nd_Wd Shng', False, 39),
('HouseStyle_1Story', False, 33),
('HouseStyle_2.5Fin', True, 1),
('HouseStyle_2.5Unf', True, 1),
('Foundation_CBlock', False, 42),
('Foundation_Slab', False, 9),
('Foundation_Stone', True, 1),
('Foundation_Wood', True, 1),
('MasVnrTyp_BrkFace', False, 56),
('MasVnrTyp_Stone', False, 5),
('RoofStyle_Gable', False, 37),
('RoofStyle_Gambrel', True, 1),
('RoofStyle_Mansard', False, 50),
('RoofStyle_Shed', False, 35),
('Exterior1st_AsphShn', False, 57),
('Exterior1st_BrkComm', True, 1),
('Exterior1st_BrkFace', True, 1),
('Exterior1st_ImStucc', False, 58),
('Exterior1st_Stone', False, 19),
('Exterior1st_WdShing', True, 1),
('GarageType_Attchd', False, 46),
('GarageType_Basment', True, 1),
('GarageType_BuiltIn', False, 10),
('GarageType_CarPort', False, 7),
('GarageType_None', True, 1)]
##Checking columns that have RFE support
# Boolean-mask the column index with rfe.support_ to get the 50 selected names
col_rfe_sup = X_train.columns[rfe.support_]
col_rfe_sup
Index(['OverallQual', 'OverallCond', 'BsmtFinSF1', 'TotalBsmtSF', 'GrLivArea',
'GarageArea', 'AgeofProperty', 'MSSubClass_45', 'MSSubClass_70',
'MSSubClass_75', 'MSSubClass_90', 'MSSubClass_160', 'LotConfig_CulDSac',
'MSZoning_FV', 'MSZoning_RL', 'Neighborhood_BrDale',
'Neighborhood_Crawfor', 'Neighborhood_Edwards', 'Neighborhood_IDOTRR',
'Neighborhood_MeadowV', 'Neighborhood_Mitchel', 'Neighborhood_NAmes',
'Neighborhood_NPkVill', 'Neighborhood_NWAmes', 'Neighborhood_NridgHt',
'Neighborhood_OldTown', 'Neighborhood_Sawyer', 'Neighborhood_StoneBr',
'Exterior2nd_AsphShn', 'Exterior2nd_Brk Cmn', 'Exterior2nd_BrkFace',
'Exterior2nd_CmentBd', 'Exterior2nd_HdBoard', 'Exterior2nd_ImStucc',
'Exterior2nd_MetalSd', 'Exterior2nd_Plywood', 'Exterior2nd_Stone',
'Exterior2nd_Stucco', 'Exterior2nd_VinylSd', 'Exterior2nd_Wd Sdng',
'HouseStyle_2.5Fin', 'HouseStyle_2.5Unf', 'Foundation_Stone',
'Foundation_Wood', 'RoofStyle_Gambrel', 'Exterior1st_BrkComm',
'Exterior1st_BrkFace', 'Exterior1st_WdShing', 'GarageType_Basment',
'GarageType_None'],
dtype='object')
## Dataframe of the 50 RFE-supported features, re-indexed from zero
top50_df = pd.DataFrame(my_zip, columns=['Features', 'rfe_support', 'rfe_ranking'])
top50_df = top50_df[top50_df['rfe_support']].reset_index(drop=True)
top50_df
| Features | rfe_support | rfe_ranking | |
|---|---|---|---|
| 0 | OverallQual | True | 1 |
| 1 | OverallCond | True | 1 |
| 2 | BsmtFinSF1 | True | 1 |
| 3 | TotalBsmtSF | True | 1 |
| 4 | GrLivArea | True | 1 |
| 5 | GarageArea | True | 1 |
| 6 | AgeofProperty | True | 1 |
| 7 | MSSubClass_45 | True | 1 |
| 8 | MSSubClass_70 | True | 1 |
| 9 | MSSubClass_75 | True | 1 |
| 10 | MSSubClass_90 | True | 1 |
| 11 | MSSubClass_160 | True | 1 |
| 12 | LotConfig_CulDSac | True | 1 |
| 13 | MSZoning_FV | True | 1 |
| 14 | MSZoning_RL | True | 1 |
| 15 | Neighborhood_BrDale | True | 1 |
| 16 | Neighborhood_Crawfor | True | 1 |
| 17 | Neighborhood_Edwards | True | 1 |
| 18 | Neighborhood_IDOTRR | True | 1 |
| 19 | Neighborhood_MeadowV | True | 1 |
| 20 | Neighborhood_Mitchel | True | 1 |
| 21 | Neighborhood_NAmes | True | 1 |
| 22 | Neighborhood_NPkVill | True | 1 |
| 23 | Neighborhood_NWAmes | True | 1 |
| 24 | Neighborhood_NridgHt | True | 1 |
| 25 | Neighborhood_OldTown | True | 1 |
| 26 | Neighborhood_Sawyer | True | 1 |
| 27 | Neighborhood_StoneBr | True | 1 |
| 28 | Exterior2nd_AsphShn | True | 1 |
| 29 | Exterior2nd_Brk Cmn | True | 1 |
| 30 | Exterior2nd_BrkFace | True | 1 |
| 31 | Exterior2nd_CmentBd | True | 1 |
| 32 | Exterior2nd_HdBoard | True | 1 |
| 33 | Exterior2nd_ImStucc | True | 1 |
| 34 | Exterior2nd_MetalSd | True | 1 |
| 35 | Exterior2nd_Plywood | True | 1 |
| 36 | Exterior2nd_Stone | True | 1 |
| 37 | Exterior2nd_Stucco | True | 1 |
| 38 | Exterior2nd_VinylSd | True | 1 |
| 39 | Exterior2nd_Wd Sdng | True | 1 |
| 40 | HouseStyle_2.5Fin | True | 1 |
| 41 | HouseStyle_2.5Unf | True | 1 |
| 42 | Foundation_Stone | True | 1 |
| 43 | Foundation_Wood | True | 1 |
| 44 | RoofStyle_Gambrel | True | 1 |
| 45 | Exterior1st_BrkComm | True | 1 |
| 46 | Exterior1st_BrkFace | True | 1 |
| 47 | Exterior1st_WdShing | True | 1 |
| 48 | GarageType_Basment | True | 1 |
| 49 | GarageType_None | True | 1 |
## Restrict train and test sets to the 50 RFE-supported features
X_train_rfe = X_train[col_rfe_sup]
# FIX: the original `X_train_rfe[X_train_rfe.columns]` re-selected every column,
# which is just a redundant copy; one explicit copy is equivalent and clearer.
X_train = X_train_rfe.copy()
X_test = X_test[X_train.columns]
## Candidate alphas (lambda) for the Ridge model
params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,
                    0.8, 0.9, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0,
                    10.0, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 50, 100,
                    500, 1000]}
ridge = Ridge()
## 5-fold cross-validated grid search, scored on negative mean absolute error
folds = 5
ridge_model_cv = GridSearchCV(estimator=ridge,
                              param_grid=params,
                              scoring='neg_mean_absolute_error',
                              cv=folds,
                              return_train_score=True,
                              verbose=1)
ridge_model_cv.fit(X_train, y_train)
Fitting 5 folds for each of 36 candidates, totalling 180 fits
GridSearchCV(cv=5, estimator=Ridge(),
param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5,
0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 4.0, 5.0,
6.0, 7.0, 8.0, 9.0, 10.0, 11, 12, 13, 14, 15,
16, 17, 18, ...]},
return_train_score=True, scoring='neg_mean_absolute_error',
verbose=1)
## Mean CV scores per alpha, best-ranked configurations first
ridge_cv_results = pd.DataFrame(ridge_model_cv.cv_results_)
ridge_cv_results[['param_alpha', 'mean_train_score',
                  'mean_test_score', 'rank_test_score']].sort_values('rank_test_score')
| param_alpha | mean_train_score | mean_test_score | rank_test_score | |
|---|---|---|---|---|
| 19 | 8.0 | -0.210048 | -0.221605 | 1 |
| 20 | 9.0 | -0.210438 | -0.221618 | 2 |
| 18 | 7.0 | -0.209694 | -0.221651 | 3 |
| 21 | 10.0 | -0.210843 | -0.221668 | 4 |
| 17 | 6.0 | -0.209364 | -0.221756 | 5 |
| 22 | 11 | -0.211257 | -0.221771 | 6 |
| 23 | 12 | -0.211672 | -0.221893 | 7 |
| 16 | 5.0 | -0.209054 | -0.221951 | 8 |
| 24 | 13 | -0.212088 | -0.222058 | 9 |
| 25 | 14 | -0.212509 | -0.222276 | 10 |
| 15 | 4.0 | -0.208787 | -0.222286 | 11 |
| 26 | 15 | -0.212934 | -0.222512 | 12 |
| 14 | 3.0 | -0.208553 | -0.222677 | 13 |
| 27 | 16 | -0.213356 | -0.222756 | 14 |
| 28 | 17 | -0.213775 | -0.223011 | 15 |
| 29 | 18 | -0.214187 | -0.223284 | 16 |
| 13 | 2.0 | -0.208301 | -0.223370 | 17 |
| 30 | 19 | -0.214592 | -0.223573 | 18 |
| 31 | 20 | -0.214986 | -0.223858 | 19 |
| 12 | 1.0 | -0.207955 | -0.224414 | 20 |
| 11 | 0.9 | -0.207898 | -0.224574 | 21 |
| 10 | 0.8 | -0.207833 | -0.224740 | 22 |
| 9 | 0.7 | -0.207759 | -0.224913 | 23 |
| 8 | 0.6 | -0.207676 | -0.225105 | 24 |
| 7 | 0.5 | -0.207590 | -0.225322 | 25 |
| 6 | 0.4 | -0.207499 | -0.225570 | 26 |
| 5 | 0.3 | -0.207394 | -0.225826 | 27 |
| 4 | 0.2 | -0.207256 | -0.226149 | 28 |
| 3 | 0.1 | -0.207088 | -0.226548 | 29 |
| 2 | 0.01 | -0.206984 | -0.227116 | 30 |
| 1 | 0.001 | -0.206974 | -0.227185 | 31 |
| 0 | 0.0001 | -0.206973 | -0.227192 | 32 |
| 32 | 50 | -0.223374 | -0.230496 | 33 |
| 33 | 100 | -0.231097 | -0.236599 | 34 |
| 34 | 500 | -0.274400 | -0.276676 | 35 |
| 35 | 1000 | -0.331544 | -0.333312 | 36 |
##Plot mean test and train scores with alpha
# FIX: the original cast param_alpha to int32, which truncated every fractional
# alpha (0.0001 ... 0.9 all became 0) and distorted the x-axis; cast to float.
ridge_cv_results['param_alpha'] = ridge_cv_results['param_alpha'].astype('float64')
# plotting
plt.plot(ridge_cv_results['param_alpha'], ridge_cv_results['mean_train_score'])
plt.plot(ridge_cv_results['param_alpha'], ridge_cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper right')
plt.show()
##Finding the best estimator for alpha (lambda parameter)
# GridSearchCV refits on the whole training set with the best-ranked alpha
ridge_model_cv.best_estimator_
Ridge(alpha=8.0)
## Refit Ridge at the CV-selected alpha (lambda = 8) and inspect its coefficients
ridge = Ridge(alpha=8)
ridge.fit(X_train, y_train)
print("Intercept: ", ridge.intercept_)
print("Coefficients:\n", ridge.coef_)
Intercept: -0.14279634059307378 Coefficients: [ 0.22811418 0.15817642 0.10680401 0.14375568 0.36228716 0.10366948 -0.28671345 0.0424148 0.17726922 0.0177001 -0.16492446 -0.23794446 0.09267265 0.2507963 0.1733638 -0.08916825 0.17653754 -0.06475937 -0.13721268 -0.16468068 -0.12735844 -0.09626932 0.00587146 -0.09345794 0.14509487 -0.11712751 -0.08589522 0.16852743 0.02925293 -0.0735141 0.08174668 0.04774744 -0.0142989 -0.02273986 0.04678954 -0.0030133 0.09018745 0.06898265 0.05626053 0.0769415 -0.05026162 -0.04175746 0.07324564 -0.08673259 0.07512959 -0.13353797 0.14108911 0.04524149 -0.08258248 -0.1168248 ]
## Predictions for train and test sets: Ridge Regression Model
y_pred_train_r = ridge.predict(X_train)
y_pred_test_r = ridge.predict(X_test)
## Evaluation metrics (R2, MSE, MAE, RMSE) for both splits
r2_score_ridge_train = r2_score(y_train, y_pred_train_r)
r2_score_ridge_test = r2_score(y_test, y_pred_test_r)
MSE_ridge_train = mean_squared_error(y_train, y_pred_train_r)
MSE_ridge_test = mean_squared_error(y_test, y_pred_test_r)
MAE_ridge_train = mean_absolute_error(y_train, y_pred_train_r)
MAE_ridge_test = mean_absolute_error(y_test, y_pred_test_r)
# RMSE is the square root of MSE
RMSE_ridge_train = np.sqrt(MSE_ridge_train)
RMSE_ridge_test = np.sqrt(MSE_ridge_test)
print("For Ridge Regression Model (Original Model, alpha=8.0):\n", "*" * 40)
print("\nFor Train Set:\nR2 score:", r2_score_ridge_train, "\nMSE score:", MSE_ridge_train,
      "\nMAE score:", MAE_ridge_train, "\nRMSE score:", RMSE_ridge_train)
print("\nFor Test Set:\nR2 score:", r2_score_ridge_test, "\nMSE score:", MSE_ridge_test,
      "\nMAE score:", MAE_ridge_test, "\nRMSE score:", RMSE_ridge_test, "\n", "*" * 40)
For Ridge Regression Model (Original Model, alpha=8.0): **************************************** For Train Set: R2 score: 0.9141662215779705 MSE score: 0.08583377842202958 MAE score: 0.21040903665829555 RMSE score: 0.29297402345946916 For Test Set: R2 score: 0.8911232017492551 MSE score: 0.10554888376965178 MAE score: 0.21666630472566778 RMSE score: 0.32488287700285434 ****************************************
## Features vs Ridge coefficients, sorted by absolute magnitude
rounded = ridge.coef_.round(4)
ridge_df = pd.DataFrame({'Features': X_train.columns,
                         'Coefficient': rounded,
                         'Abs_Coefficient_Ridge(Desc_Sort)': abs(rounded)})
ridge_df = (ridge_df.sort_values(by='Abs_Coefficient_Ridge(Desc_Sort)', ascending=False)
                    .reset_index(drop=True))
ridge_df.head(10) #Top10
| Features | Coefficient | Abs_Coefficient_Ridge(Desc_Sort) | |
|---|---|---|---|
| 0 | GrLivArea | 0.3623 | 0.3623 |
| 1 | AgeofProperty | -0.2867 | 0.2867 |
| 2 | MSZoning_FV | 0.2508 | 0.2508 |
| 3 | MSSubClass_160 | -0.2379 | 0.2379 |
| 4 | OverallQual | 0.2281 | 0.2281 |
| 5 | MSSubClass_70 | 0.1773 | 0.1773 |
| 6 | Neighborhood_Crawfor | 0.1765 | 0.1765 |
| 7 | MSZoning_RL | 0.1734 | 0.1734 |
| 8 | Neighborhood_StoneBr | 0.1685 | 0.1685 |
| 9 | MSSubClass_90 | -0.1649 | 0.1649 |
##Coefficient value plot (Ridge Regression)
top10_ridge_df = ridge_df.loc[:9]  # top 10 coefficients by absolute value
sns.set(style='white')
plt.figure(figsize=(16, 8), dpi=120)
ax3 = sns.barplot(y=top10_ridge_df['Features'], x=top10_ridge_df['Coefficient'], palette='Set1')
plt.xlabel('Coefficient Values', fontsize=14, fontstyle='italic')
plt.ylabel('Features', fontsize=14, fontstyle='italic')
plt.title('Coefficents of Top 10 Features (Ridge Regression):', fontsize=18, fontweight='bold')
# Annotate each bar with its coefficient value
for bar_idx, coef_val in enumerate(top10_ridge_df['Coefficient']):
    plt.text(coef_val, bar_idx, str(coef_val), fontsize=13)
plt.grid(True)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.autoscale()
plt.tight_layout()
plt.show()
The chart mentioned above displays the Top 10 predictors based on the Ridge Regression model, that are significant in predicting the sale price of the house.
Checking for the error terms distribution. They should be normally distributed (as it is one of the major assumptions of linear regression).
# Plot the histogram of the error terms (residuals should be ~normal, mean 0)
sns.set_style('white')
plt.figure(figsize=(5, 5), dpi=100)
res = (y_train - y_pred_train_r)  # residuals
# FIX: sns.distplot was deprecated in seaborn 0.11 and later removed;
# histplot with kde=True is the supported equivalent (distplot drew a KDE by default).
sns.histplot(res, bins=30, color='g', kde=True)
plt.title('Error Terms: Distribution', fontweight='bold', fontsize = 18)
plt.xlabel('Errors', fontstyle='italic', fontsize = 12)
plt.grid(True)
plt.show()
Error terms seem to be approximately normally distributed with mean 0, so our assumption holds true.
# Scatter of actual vs predicted train values to inspect the residual spread
sns.set_style('white')
plt.figure(figsize=(8, 6))
plt.scatter(y_train, y_pred_train_r)
plt.title('y_train vs y_pred_train_r', fontweight='bold', fontsize=20)
plt.xlabel('y_train', fontstyle='italic', fontsize=16)
plt.ylabel('y_pred_train_r', fontstyle='italic', fontsize=16)
plt.grid(True)
plt.show()
y_train.shape
(966,)
# Actual and Predicted (Train Set)
sns.set_style('white')
plt.figure(figsize=(10, 4), dpi=120)
# FIX: derive the x-axis from the data length instead of hard-coding 966,
# so the plot stays correct if the split size changes.
idx_train = range(len(y_train))
plt.plot(idx_train, y_pred_train_r, color="blue", linewidth=1, linestyle="-")  # Predicted 'y_pred_train_r'
plt.plot(idx_train, y_train, color="red", linewidth=1, linestyle="-")  # Actual 'y_train'
plt.title("Actual (y_train) and Predicted (y_pred_train_r): Train Set (Ridge)", fontsize=15, fontweight='bold')
plt.xlabel('Index', fontsize=15, fontstyle='italic')
plt.ylabel('SalePrice', fontsize=15, fontstyle='italic')
plt.show()
# Error terms for train set (residuals vs observation index)
sns.set_style('white')
plt.figure(figsize=(6, 4), dpi=100)
res_train = (y_train - y_pred_train_r)  # residuals (train set)
# FIX: use the actual series length rather than the hard-coded 966
plt.scatter(range(len(res_train)), res_train)
plt.title('Error Terms', fontsize=18, fontweight='bold')
plt.xlabel('Index', fontstyle='italic', fontsize=14)
plt.ylabel('y_train - y_pred_train_r', fontstyle='italic', fontsize=14)
plt.grid(True)
plt.show()
The residuals are scattered along (y=0) and are independent of each other.
# Scatter of actual vs predicted on the test set to inspect the spread
sns.set_style('white')
fig = plt.figure(figsize=(6, 4), dpi=100)
plt.scatter(y_test, y_pred_test_r)
fig.suptitle('y_test vs y_pred_test_r', fontsize=18)
plt.xlabel('y_test', fontsize=14)
plt.ylabel('y_pred_test_r', fontsize=14)
plt.grid(True)
plt.show()
y_test.shape
(415,)
# Actual vs Predicted (Test Set)
plt.figure(figsize=(10, 4), dpi=120)
# FIX: derive the x-axis from the data length instead of hard-coding 415
idx_test = range(len(y_test))
plt.plot(idx_test, y_pred_test_r, color="blue", linewidth=1, linestyle="-")  # Predicted 'y_pred_test_r'
plt.plot(idx_test, y_test, color="red", linewidth=1, linestyle="-")  # Actual 'y_test'
plt.title('Actual (y_test) vs Predicted (y_pred_test_r): Test Set', fontsize=15, fontweight='bold')
plt.ylabel('SalePrice', fontsize=15, fontstyle='italic')
plt.xlabel('Index', fontsize=15, fontstyle='italic')
plt.show()
Blue: Predicted (y_pred_test_r)- - - - - - - - - - - - - - Red: Actual (y_test)
# Error terms (test set residuals vs observation index)
fig = plt.figure(figsize=(6, 4), dpi=100)
res_test = (y_test - y_pred_test_r)  # residuals test set
# FIX: use the actual series length rather than the hard-coded 415
plt.scatter(range(len(res_test)), res_test)
fig.suptitle('Error Terms', fontsize=20)
plt.xlabel('Index', fontsize=18)
plt.ylabel('y_test - y_pred_test_r', fontsize=16)
plt.grid(True)
plt.show()
The residuals are scattered along (y=0) and are independent of each other.
##Lasso Regression Model.
lasso = Lasso()
## Candidate alphas (lambda parameter)
params_1 = {'alpha': [0.00001, 0.0001, 0.0002, 0.0003, 0.0005, 0.0006, 0.0007,
                      0.0008, 0.0009, 0.001, 0.002, 0.003, 0.004, 0.005, 0.01,
                      0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
                      2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50,
                      100, 500, 1000]}
## 5-fold cross-validated grid search, scored on negative mean absolute error
folds = 5
lasso_model_cv = GridSearchCV(estimator=lasso,
                              param_grid=params_1,
                              scoring='neg_mean_absolute_error',
                              cv=folds,
                              return_train_score=True,
                              verbose=1)
lasso_model_cv.fit(X_train, y_train)
Fitting 5 folds for each of 40 candidates, totalling 200 fits
GridSearchCV(cv=5, estimator=Lasso(),
param_grid={'alpha': [1e-05, 0.0001, 0.0002, 0.0003, 0.0005,
0.0006, 0.0007, 0.0008, 0.0009, 0.001, 0.002,
0.003, 0.004, 0.005, 0.01, 0.05, 0.1, 0.2,
0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0,
3.0, 4.0, 5.0, ...]},
return_train_score=True, scoring='neg_mean_absolute_error',
verbose=1)
## Mean CV scores per alpha, best-ranked configurations first
lasso_cv_results = pd.DataFrame(lasso_model_cv.cv_results_)
lasso_cv_results[['param_alpha', 'mean_train_score',
                  'mean_test_score', 'rank_test_score']].sort_values('rank_test_score')
| param_alpha | mean_train_score | mean_test_score | rank_test_score | |
|---|---|---|---|---|
| 9 | 0.001 | -0.210209 | -0.223918 | 1 |
| 8 | 0.0009 | -0.209909 | -0.224016 | 2 |
| 7 | 0.0008 | -0.209655 | -0.224161 | 3 |
| 6 | 0.0007 | -0.209419 | -0.224263 | 4 |
| 5 | 0.0006 | -0.209151 | -0.224317 | 5 |
| 10 | 0.002 | -0.213306 | -0.224366 | 6 |
| 4 | 0.0005 | -0.208900 | -0.224573 | 7 |
| 3 | 0.0003 | -0.208427 | -0.225085 | 8 |
| 2 | 0.0002 | -0.207882 | -0.225590 | 9 |
| 1 | 0.0001 | -0.207238 | -0.226455 | 10 |
| 11 | 0.003 | -0.217524 | -0.226752 | 11 |
| 0 | 0.00001 | -0.206977 | -0.227095 | 12 |
| 12 | 0.004 | -0.221471 | -0.230009 | 13 |
| 13 | 0.005 | -0.225425 | -0.233228 | 14 |
| 14 | 0.01 | -0.240006 | -0.243933 | 15 |
| 15 | 0.05 | -0.253709 | -0.256450 | 16 |
| 16 | 0.1 | -0.286503 | -0.288615 | 17 |
| 17 | 0.2 | -0.345715 | -0.347300 | 18 |
| 18 | 0.3 | -0.421508 | -0.423669 | 19 |
| 19 | 0.4 | -0.503736 | -0.506022 | 20 |
| 20 | 0.5 | -0.576048 | -0.577835 | 21 |
| 21 | 0.6 | -0.636055 | -0.637773 | 22 |
| 22 | 0.7 | -0.697506 | -0.699013 | 23 |
| 23 | 0.8 | -0.761580 | -0.762717 | 24 |
| 37 | 100 | -0.775470 | -0.775675 | 25 |
| 36 | 50 | -0.775470 | -0.775675 | 25 |
| 35 | 20 | -0.775470 | -0.775675 | 25 |
| 34 | 10.0 | -0.775470 | -0.775675 | 25 |
| 33 | 9.0 | -0.775470 | -0.775675 | 25 |
| 32 | 8.0 | -0.775470 | -0.775675 | 25 |
| 31 | 7.0 | -0.775470 | -0.775675 | 25 |
| 29 | 5.0 | -0.775470 | -0.775675 | 25 |
| 38 | 500 | -0.775470 | -0.775675 | 25 |
| 28 | 4.0 | -0.775470 | -0.775675 | 25 |
| 27 | 3.0 | -0.775470 | -0.775675 | 25 |
| 26 | 2.0 | -0.775470 | -0.775675 | 25 |
| 25 | 1.0 | -0.775470 | -0.775675 | 25 |
| 24 | 0.9 | -0.775470 | -0.775675 | 25 |
| 30 | 6.0 | -0.775470 | -0.775675 | 25 |
| 39 | 1000 | -0.775470 | -0.775675 | 25 |
## Re-run the grid over a magnified, lower range of alpha
lasso = Lasso()
## Smaller alpha range chosen from the lasso_cv_results ranking above
params_2 = {'alpha': [0.00001, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006,
                      0.0007, 0.0008, 0.0009, 0.001, 0.002, 0.003, 0.005,
                      0.01, 0.02, 0.05]}
## 5-fold cross-validation, same scoring as before
folds = 5
lasso_model_cv = GridSearchCV(estimator=lasso,
                              param_grid=params_2,
                              scoring='neg_mean_absolute_error',
                              cv=folds,
                              return_train_score=True,
                              verbose=1)
lasso_model_cv.fit(X_train, y_train)
Fitting 5 folds for each of 17 candidates, totalling 85 fits
GridSearchCV(cv=5, estimator=Lasso(),
param_grid={'alpha': [1e-05, 0.0001, 0.0002, 0.0003, 0.0004,
0.0005, 0.0006, 0.0007, 0.0008, 0.0009,
0.001, 0.002, 0.003, 0.005, 0.01, 0.02,
0.05]},
return_train_score=True, scoring='neg_mean_absolute_error',
verbose=1)
##ReDisplay the mean scores
# Ranked CV results for the refined (magnified) alpha grid
lasso_cv_results = pd.DataFrame(lasso_model_cv.cv_results_)
lasso_cv_results[['param_alpha', 'mean_train_score', 'mean_test_score', 'rank_test_score']].sort_values(by = ['rank_test_score'])
| param_alpha | mean_train_score | mean_test_score | rank_test_score | |
|---|---|---|---|---|
| 10 | 0.001 | -0.210209 | -0.223918 | 1 |
| 9 | 0.0009 | -0.209909 | -0.224016 | 2 |
| 8 | 0.0008 | -0.209655 | -0.224161 | 3 |
| 7 | 0.0007 | -0.209419 | -0.224263 | 4 |
| 6 | 0.0006 | -0.209151 | -0.224317 | 5 |
| 11 | 0.002 | -0.213306 | -0.224366 | 6 |
| 5 | 0.0005 | -0.208900 | -0.224573 | 7 |
| 4 | 0.0004 | -0.208683 | -0.224816 | 8 |
| 3 | 0.0003 | -0.208427 | -0.225085 | 9 |
| 2 | 0.0002 | -0.207882 | -0.225590 | 10 |
| 1 | 0.0001 | -0.207238 | -0.226455 | 11 |
| 12 | 0.003 | -0.217524 | -0.226752 | 12 |
| 0 | 0.00001 | -0.206977 | -0.227095 | 13 |
| 13 | 0.005 | -0.225425 | -0.233228 | 14 |
| 14 | 0.01 | -0.240006 | -0.243933 | 15 |
| 15 | 0.02 | -0.243076 | -0.245842 | 16 |
| 16 | 0.05 | -0.253709 | -0.256450 | 17 |
## Plot mean train/test scores against alpha
# Ensure alpha is numeric (float) for a correct x-axis
lasso_cv_results['param_alpha'] = lasso_cv_results['param_alpha'].astype('float64')
plt.plot(lasso_cv_results['param_alpha'], lasso_cv_results['mean_train_score'])
plt.plot(lasso_cv_results['param_alpha'], lasso_cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper right')
plt.show()
# get the best estimator for lambda
# GridSearchCV's refit Lasso at the best-ranked alpha
lasso_model_cv.best_estimator_
Lasso(alpha=0.001)
# Refit Lasso at the CV-selected alpha (lambda = 0.001) and inspect its coefficients
lasso = Lasso(alpha=0.001)
lasso.fit(X_train, y_train)
print("Intercept: ", lasso.intercept_)
print("Coefficients:\n ", lasso.coef_)
Intercept: -0.1260757628521606 Coefficients: [ 0.22562161 0.15995232 0.10487383 0.14147518 0.36496911 0.10209831 -0.2958012 0. 0.19745142 -0. -0.18270893 -0.31252446 0.08811376 0.31416549 0.18034293 -0. 0.21183154 -0.03766555 -0.1196319 -0.13312394 -0.12955484 -0.08127157 0. -0.08579786 0.16243639 -0.09499201 -0.07298262 0.20798766 0. -0. 0. 0. -0.03968562 -0. 0.012907 -0.01978675 0. 0.01400037 0.01925347 0.0406787 -0. -0. 0. -0. 0.01891555 -0.30591779 0.16465458 0. -0.08127922 -0.11444202]
## Predictions for train and test sets: Lasso Regression Model
y_pred_train_l = lasso.predict(X_train)
y_pred_test_l = lasso.predict(X_test)
## Evaluation metrics (R2, MSE, MAE, RMSE) for both splits
r2_score_lasso_train = r2_score(y_train, y_pred_train_l)
r2_score_lasso_test = r2_score(y_test, y_pred_test_l)
MSE_lasso_train = mean_squared_error(y_train, y_pred_train_l)
MSE_lasso_test = mean_squared_error(y_test, y_pred_test_l)
MAE_lasso_train = mean_absolute_error(y_train, y_pred_train_l)
MAE_lasso_test = mean_absolute_error(y_test, y_pred_test_l)
# RMSE is the square root of MSE
RMSE_lasso_train = np.sqrt(MSE_lasso_train)
RMSE_lasso_test = np.sqrt(MSE_lasso_test)
print("For Lasso Regression Model (Original Model: alpha=0.001):\n", "*" * 40)
print("\nFor Train Set:\nR2 score:", r2_score_lasso_train, "\nMSE score:", MSE_lasso_train,
      "\nMAE score:", MAE_lasso_train, "\nRMSE score:", RMSE_lasso_train)
print("\nFor Test Set:\nR2 score:", r2_score_lasso_test, "\nMSE score:", MSE_lasso_test,
      "\nMAE score:", MAE_lasso_test, "\nRMSE score:", RMSE_lasso_test, "\n", "*" * 40)
For Lasso Regression Model (Original Model: alpha=0.001): **************************************** For Train Set: R2 score: 0.9137483642080722 MSE score: 0.08625163579192789 MAE score: 0.21147969374732178 RMSE score: 0.29368628805568686 For Test Set: R2 score: 0.8927812619728864 MSE score: 0.10394150360566035 MAE score: 0.2152225550413117 RMSE score: 0.32239960236585335 ****************************************
## Features vs Lasso coefficients, sorted by absolute magnitude
rounded_l = lasso.coef_.round(4)
lasso_df = pd.DataFrame({'Features': X_train.columns,
                         'Coefficient': rounded_l,
                         'Abs_Coefficient_Lasso(Desc_Sort)': abs(rounded_l)})
lasso_df = (lasso_df.sort_values(by='Abs_Coefficient_Lasso(Desc_Sort)', ascending=False)
                    .reset_index(drop=True))
lasso_df.head(10) #Top10 features display
| Features | Coefficient | Abs_Coefficient_Lasso(Desc_Sort) | |
|---|---|---|---|
| 0 | GrLivArea | 0.3650 | 0.3650 |
| 1 | MSZoning_FV | 0.3142 | 0.3142 |
| 2 | MSSubClass_160 | -0.3125 | 0.3125 |
| 3 | Exterior1st_BrkComm | -0.3059 | 0.3059 |
| 4 | AgeofProperty | -0.2958 | 0.2958 |
| 5 | OverallQual | 0.2256 | 0.2256 |
| 6 | Neighborhood_Crawfor | 0.2118 | 0.2118 |
| 7 | Neighborhood_StoneBr | 0.2080 | 0.2080 |
| 8 | MSSubClass_70 | 0.1975 | 0.1975 |
| 9 | MSSubClass_90 | -0.1827 | 0.1827 |
## Bar plot of the ten largest-magnitude lasso coefficients
top10_lasso_df = lasso_df.loc[:9]  # first 10 rows (lasso_df is already sorted by |coef|)
sns.set(style='white')
plt.figure(figsize=(16, 8), dpi=120)
ax4 = sns.barplot(y=top10_lasso_df['Features'], x=top10_lasso_df['Coefficient'], palette='Set1')
plt.xlabel('Coefficient Values', fontsize=14, fontstyle='italic')
plt.ylabel('Features', fontsize=14, fontstyle='italic')
# Fixed typo in the chart title: 'Coefficents' -> 'Coefficients'
plt.title('Coefficients of Top 10 Features (Lasso Regression):', fontsize=18, fontweight='bold')
c = top10_lasso_df['Coefficient']  # coefficient values used to annotate the bars
for index, value in enumerate(c):
    plt.text(value, index, str(value), fontsize=13)  # label each bar with its value
plt.grid(True)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.autoscale()
plt.tight_layout()
plt.show()
## Top 5 lasso features, kept for answering 'Question 3' later
top5_original_lasso_features = list(top10_lasso_df['Features'].iloc[0:5])
top5_original_lasso_features
['GrLivArea', 'MSZoning_FV', 'MSSubClass_160', 'Exterior1st_BrkComm', 'AgeofProperty']
Checking for the error terms distribution. They should be normally distributed (as it is one of the major assumptions of linear regression).
# Histogram of the residuals (train set) -- linear regression assumes these are ~normal
sns.set_style('white')
plt.figure(figsize=(5, 5), dpi=100)
res1 = (y_train - y_pred_train_l)  # residuals = actual - predicted
# sns.distplot is deprecated (removed in seaborn 0.14); histplot(kde=True) is the replacement
sns.histplot(res1, bins=30, color='g', kde=True)
plt.title('Error Terms: Distribution', fontweight='bold', fontsize=18)
plt.xlabel('Errors', fontstyle='italic', fontsize=12)
plt.grid(True)
plt.show()
# Plotting y_train and y_train_pred to understand the residuals.
sns.set_style('white')
plt.figure(figsize = (8,6))
plt.scatter(y_train,y_pred_train_l)
plt.title('y_train vs y_pred_train_l', fontweight='bold', fontsize = 20)
plt.xlabel('y_train', fontstyle='italic', fontsize = 16)
plt.ylabel('y_pred_train_l', fontstyle='italic', fontsize = 16)
plt.grid(True)
plt.show()
y_train.shape  # sanity-check: number of observations in the train set
(966,)
# Overlay of actual vs predicted SalePrice across the train-set observations
sns.set_style('white')
plt.figure(figsize=(10, 4), dpi=120)
# Index positions derived from the data itself instead of the hard-coded 966,
# so the plot stays correct if the train/test split changes
f1 = list(range(len(y_train)))
g1 = list(range(len(y_train)))
plt.plot(f1, y_pred_train_l, color="blue", linewidth=1, linestyle="-")  # predicted 'y_pred_train_l'
plt.plot(g1, y_train, color="red", linewidth=1, linestyle="-")  # actual 'y_train'
plt.title("Actual (y_train) and Predicted (y_pred_train_l): Train Set (Lasso)", fontsize=15, fontweight='bold')
plt.xlabel('Index', fontsize=15, fontstyle='italic')
plt.ylabel('SalePrice', fontsize=15, fontstyle='italic')
plt.show()
# Residuals vs observation index (train set) -- checks that errors are independent
sns.set_style('white')
plt.figure(figsize=(6, 4), dpi=100)
j1 = list(range(len(y_train)))  # one index per train observation (no hard-coded count)
res_train1 = (y_train - y_pred_train_l)  # residuals (train set)
plt.scatter(j1, res_train1)
plt.title('Error Terms', fontsize=18, fontweight='bold')
plt.xlabel('Index', fontstyle='italic', fontsize=14)
plt.ylabel('y_train - y_pred_train_l', fontstyle='italic', fontsize=14)
plt.grid(True)
plt.show()
The residuals are scattered along (y=0) and are independent of each other.
# Histogram of the residuals (test set) -- should be approximately normal, centered at 0
sns.set_style('white')
plt.figure(figsize=(5, 5), dpi=100)
res1 = (y_test - y_pred_test_l)  # residuals = actual - predicted
# sns.distplot is deprecated (removed in seaborn 0.14); histplot(kde=True) is the replacement
sns.histplot(res1, bins=30, color='g', kde=True)
plt.title('Error Terms: Distribution', fontweight='bold', fontsize=18)
plt.xlabel('Errors', fontstyle='italic', fontsize=12)
plt.grid(True)
plt.show()
Error terms seem to be approximately normally distributed with mean 0, so our assumption holds true.
# Actual vs predicted scatter for the test set, to inspect the spread around the diagonal
sns.set_style('white')
fig = plt.figure(figsize=(6, 4), dpi=100)
plt.scatter(y_test, y_pred_test_l)
fig.suptitle('y_test vs y_pred_test_l', fontsize=18)
plt.xlabel('y_test', fontsize=14)
plt.ylabel('y_pred_test_l', fontsize=14)
plt.grid(True)
plt.show()
y_test.shape  # sanity-check: number of observations in the test set
(415,)
# Overlay of actual vs predicted SalePrice across the test-set observations
plt.figure(figsize=(10, 4), dpi=120)
# Index positions derived from the data itself instead of the hard-coded 415,
# so the plot stays correct if the train/test split changes
p2 = list(range(len(y_test)))
l2 = list(range(len(y_test)))
plt.plot(p2, y_pred_test_l, color="blue", linewidth=1, linestyle="-")  # predicted 'y_pred_test_l'
plt.plot(l2, y_test, color="red", linewidth=1, linestyle="-")  # actual 'y_test'
plt.title('Actual (y_test) vs Predicted (y_pred_test_l): Test Set', fontsize=15, fontweight='bold')
plt.ylabel('SalePrice', fontsize=15, fontstyle='italic')
plt.xlabel('Index', fontsize=15, fontstyle='italic')
plt.show()
# Residuals vs observation index (test set) -- checks that errors are independent
fig = plt.figure(figsize=(6, 4), dpi=100)
o1 = list(range(len(y_test)))  # one index per test observation (no hard-coded count)
res_test1 = (y_test - y_pred_test_l)  # residuals (test set)
plt.scatter(o1, res_test1)
fig.suptitle('Error Terms', fontsize=20)
plt.xlabel('Index', fontsize=18)
# Fixed axis-label typo: 'y_pred_test_1' (digit one) -> 'y_pred_test_l'
plt.ylabel('y_test - y_pred_test_l', fontsize=16)
plt.grid(True)
plt.show()
The residuals are scattered along (y=0) and are independent of each other.
## Optimal alpha values restated here for reuse (computed earlier in the notebook,
## presumably via the GridSearchCV runs above -- confirm against those cells)
optimal_alpha_ridge = 8.0 #(Computed Above: For Ridge Regression)
optimal_alpha_lasso = 0.001 #(Computed Above: For Lasso Regression)
Below we examine how the models change when we double the value of alpha for both ridge and lasso regression.
## Refit ridge with double the optimal alpha (8 * 2 = 16) to study the effect on the model
## Derive the value from the stored optimum instead of hard-coding 16, so the two cells
## cannot drift apart if the optimum is ever recomputed
ridge = Ridge(alpha=optimal_alpha_ridge * 2)
ridge.fit(X_train, y_train)
print("Intercept: ", ridge.intercept_)
print("Coefficients:\n", ridge.coef_)
Intercept: -0.12966061145676755 Coefficients: [ 0.24224166 0.15628609 0.10582416 0.1456706 0.35751861 0.10847221 -0.27847847 0.02328501 0.15068923 0.00392081 -0.13762511 -0.19444553 0.08085395 0.20222076 0.16677873 -0.08084691 0.1565555 -0.05164921 -0.10782731 -0.1159138 -0.10064093 -0.07962394 -0.00295994 -0.07494303 0.11547805 -0.10354892 -0.06553052 0.12225787 0.01599864 -0.05475906 0.06061098 0.02551161 -0.0294213 -0.02358916 0.03091463 -0.01717715 0.05451808 0.04739797 0.04734031 0.05829194 -0.02658924 -0.0303841 0.04508568 -0.04973488 0.05122138 -0.07733566 0.11897601 0.02918533 -0.06086114 -0.10235333]
## Doubled-alpha Ridge model: predictions for the train and test sets
y_pred_train_r = ridge.predict(X_train)
y_pred_test_r = ridge.predict(X_test)
## Evaluation metrics on both sets:
## R2 (goodness of fit)
r2_score_ridge_train = r2_score(y_train, y_pred_train_r)
r2_score_ridge_test = r2_score(y_test, y_pred_test_r)
## Mean Squared Error
MSE_ridge_train = mean_squared_error(y_true=y_train, y_pred=y_pred_train_r)
MSE_ridge_test = mean_squared_error(y_true=y_test, y_pred=y_pred_test_r)
## Mean Absolute Error
MAE_ridge_train = mean_absolute_error(y_true=y_train, y_pred=y_pred_train_r)
MAE_ridge_test = mean_absolute_error(y_true=y_test, y_pred=y_pred_test_r)
## Root Mean Squared Error (square root of the MSE)
RMSE_ridge_train = np.sqrt(MSE_ridge_train)
RMSE_ridge_test = np.sqrt(MSE_ridge_test)
## Report every metric for both sets
print("For Ridge Regression Model (Doubled alpha model, alpha=8*2=16):\n", "*" * 40)
print("\nFor Train Set:\nR2 score:", r2_score_ridge_train, "\nMSE score:", MSE_ridge_train,
      "\nMAE score:", MAE_ridge_train, "\nRMSE score:", RMSE_ridge_train)
print("\nFor Test Set:\nR2 score:", r2_score_ridge_test, "\nMSE score:", MSE_ridge_test,
      "\nMAE score:", MAE_ridge_test, "\nRMSE score:", RMSE_ridge_test, "\n", "*" * 40)
For Ridge Regression Model (Doubled alpha model, alpha=8*2=16): **************************************** For Train Set: R2 score: 0.9118928405717794 MSE score: 0.08810715942822064 MAE score: 0.2126743189186682 RMSE score: 0.2968285017113765 For Test Set: R2 score: 0.8904731985528808 MSE score: 0.10617901905032019 MAE score: 0.21782052246333078 RMSE score: 0.32585122226304475 ****************************************
## Pair each feature with its ridge coefficient (rounded to 4 decimals)
ridge_rounded = ridge.coef_.round(4)
ridge_df = pd.DataFrame({'Features': X_train.columns,
                         'Coefficient': ridge_rounded,
                         'Abs_Coefficient_Ridge(Desc_Sort)': abs(ridge_rounded)})
## Rank features by absolute coefficient magnitude, largest first, with a clean index
ridge_df = (ridge_df
            .sort_values(by='Abs_Coefficient_Ridge(Desc_Sort)', ascending=False)
            .reset_index(drop=True))
ridge_df.head(10)  # show the ten strongest features
| Features | Coefficient | Abs_Coefficient_Ridge(Desc_Sort) | |
|---|---|---|---|
| 0 | GrLivArea | 0.3575 | 0.3575 |
| 1 | AgeofProperty | -0.2785 | 0.2785 |
| 2 | OverallQual | 0.2422 | 0.2422 |
| 3 | MSZoning_FV | 0.2022 | 0.2022 |
| 4 | MSSubClass_160 | -0.1944 | 0.1944 |
| 5 | MSZoning_RL | 0.1668 | 0.1668 |
| 6 | Neighborhood_Crawfor | 0.1566 | 0.1566 |
| 7 | OverallCond | 0.1563 | 0.1563 |
| 8 | MSSubClass_70 | 0.1507 | 0.1507 |
| 9 | TotalBsmtSF | 0.1457 | 0.1457 |
## Bar plot of the top 10 ridge coefficients (doubled-alpha model)
top10_ridge_df = ridge_df.loc[:9]  # first 10 rows (ridge_df is already sorted by |coef|)
sns.set(style='white')
plt.figure(figsize=(16, 8), dpi=120)
ax3 = sns.barplot(y=top10_ridge_df['Features'], x=top10_ridge_df['Coefficient'], palette='Set1')
plt.xlabel('Coefficient Values', fontsize=14, fontstyle='italic')
plt.ylabel('Features', fontsize=14, fontstyle='italic')
# Fixed: the title previously said 'alpha=6*2=12', but this model was fitted with
# Ridge(alpha=16), i.e. 8*2=16 (see the fit cell and the prints below).
# Also fixed the 'Coefficents' typo.
plt.title('Coefficients of Top 10 Features (Ridge Regression):[Doubled alpha model, alpha=8*2=16]', fontsize=18, fontweight='bold')
coef = top10_ridge_df['Coefficient']  # coefficient values used to annotate the bars
for index, value in enumerate(coef):
    plt.text(value, index, str(value), fontsize=13)  # label each bar with its value
plt.grid(True)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.autoscale()
plt.tight_layout()
plt.show()
print("For Ridge Regression (Doubled alpha model, alpha=8*2=16): \n", "*" * 125)
print("The most important top10 predictor variables after the change is implemented are as follows:\n\n",
      list(top10_ridge_df['Features']), "\n", "*" * 125)
For Ridge Regression (Doubled alpha model, alpha=8*2=16): ***************************************************************************************************************************** The most important top10 predictor variables after the change is implemented are as follows: ['GrLivArea', 'AgeofProperty', 'OverallQual', 'MSZoning_FV', 'MSSubClass_160', 'MSZoning_RL', 'Neighborhood_Crawfor', 'OverallCond', 'MSSubClass_70', 'TotalBsmtSF'] *****************************************************************************************************************************
## Refit lasso with double the optimal alpha (0.001 * 2 = 0.002) to study the effect
## Derive the value from the stored optimum instead of hard-coding 0.002, so the two
## cells cannot drift apart if the optimum is ever recomputed
lasso = Lasso(alpha=optimal_alpha_lasso * 2)
lasso.fit(X_train, y_train)
print("Intercept: ", lasso.intercept_)
print("Coefficients:\n ", lasso.coef_)
Intercept: -0.14421116610261947 Coefficients: [ 0.24515523 0.15904187 0.10325384 0.14258939 0.36425581 0.10472228 -0.29690021 0. 0.17230256 -0. -0.15993685 -0.26990325 0.06157464 0.28056602 0.18252235 -0. 0.21391105 -0.00119804 -0.04132858 -0.03617834 -0.07573251 -0.04201532 -0. -0.05123676 0.12337558 -0.04024964 -0.02299667 0.13158657 0. -0. 0. -0. -0.03821582 -0. 0.00132769 -0.01656393 0. 0. 0.01854981 0.02838593 -0. -0. 0. -0. 0. -0. 0.12763467 0. -0.00616419 -0.09043783]
## Doubled-alpha Lasso model: predictions for the train and test sets
y_pred_train_l = lasso.predict(X_train)
y_pred_test_l = lasso.predict(X_test)
## Evaluation metrics on both sets:
## R2 (goodness of fit)
r2_score_lasso_train = r2_score(y_train, y_pred_train_l)
r2_score_lasso_test = r2_score(y_test, y_pred_test_l)
## Mean Squared Error
MSE_lasso_train = mean_squared_error(y_true=y_train, y_pred=y_pred_train_l)
MSE_lasso_test = mean_squared_error(y_true=y_test, y_pred=y_pred_test_l)
## Mean Absolute Error
MAE_lasso_train = mean_absolute_error(y_true=y_train, y_pred=y_pred_train_l)
MAE_lasso_test = mean_absolute_error(y_true=y_test, y_pred=y_pred_test_l)
## Root Mean Squared Error (square root of the MSE)
RMSE_lasso_train = np.sqrt(MSE_lasso_train)
RMSE_lasso_test = np.sqrt(MSE_lasso_test)
## Report every metric for both sets
print("For Lasso Regression Model: (Doubled alpha model: alpha:0.001*2 = 0.002)\n", "*" * 40)
print("\nFor Train Set:\nR2 score:", r2_score_lasso_train, "\nMSE score:", MSE_lasso_train,
      "\nMAE score:", MAE_lasso_train, "\nRMSE score:", RMSE_lasso_train)
print("\nFor Test Set:\nR2 score:", r2_score_lasso_test, "\nMSE score:", MSE_lasso_test,
      "\nMAE score:", MAE_lasso_test, "\nRMSE score:", RMSE_lasso_test, "\n", "*" * 40)
For Lasso Regression Model: (Doubled alpha model: alpha:0.001*2 = 0.002) **************************************** For Train Set: R2 score: 0.9103228278462835 MSE score: 0.08967717215371657 MAE score: 0.21401154178442824 RMSE score: 0.2994614702323432 For Test Set: R2 score: 0.8920884694594363 MSE score: 0.10461312031053645 MAE score: 0.21611414123017147 RMSE score: 0.3234395156911667 ****************************************
## Feature/coefficient table for the doubled-alpha lasso model
rounded = lasso.coef_.round(4)
lasso_df = pd.DataFrame({'Features': X_train.columns,
                         'Coefficient': rounded,
                         'Abs_Coefficient_Lasso(Desc_Sort)': abs(rounded)})
## Rank by coefficient magnitude, biggest first, with a clean index
lasso_df = (lasso_df
            .sort_values(by='Abs_Coefficient_Lasso(Desc_Sort)', ascending=False)
            .reset_index(drop=True))
lasso_df.head(10)  # show the ten strongest features
| Features | Coefficient | Abs_Coefficient_Lasso(Desc_Sort) | |
|---|---|---|---|
| 0 | GrLivArea | 0.3643 | 0.3643 |
| 1 | AgeofProperty | -0.2969 | 0.2969 |
| 2 | MSZoning_FV | 0.2806 | 0.2806 |
| 3 | MSSubClass_160 | -0.2699 | 0.2699 |
| 4 | OverallQual | 0.2452 | 0.2452 |
| 5 | Neighborhood_Crawfor | 0.2139 | 0.2139 |
| 6 | MSZoning_RL | 0.1825 | 0.1825 |
| 7 | MSSubClass_70 | 0.1723 | 0.1723 |
| 8 | MSSubClass_90 | -0.1599 | 0.1599 |
| 9 | OverallCond | 0.1590 | 0.1590 |
## Bar plot of the top 10 lasso coefficients (doubled-alpha model)
top10_lasso_df = lasso_df.loc[:9]  # first 10 rows (lasso_df is already sorted by |coef|)
sns.set(style='white')
plt.figure(figsize=(16, 8), dpi=120)
ax4 = sns.barplot(y=top10_lasso_df['Features'], x=top10_lasso_df['Coefficient'], palette='Set1')
plt.xlabel('Coefficient Values', fontsize=14, fontstyle='italic')
plt.ylabel('Features', fontsize=14, fontstyle='italic')
# Fixed typo in the chart title: 'Coefficents' -> 'Coefficients'
plt.title('Coefficients of Top 10 Features (Lasso Regression):[Doubled alpha model, alpha=0.001*2=0.002]', fontsize=18, fontweight='bold')
c = top10_lasso_df['Coefficient']  # coefficient values used to annotate the bars
for index, value in enumerate(c):
    plt.text(value, index, str(value), fontsize=13)  # label each bar with its value
plt.grid(True)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.autoscale()
plt.tight_layout()
plt.show()
# Fixed copy-paste error: this summary follows the doubled-alpha LASSO plot above,
# but previously printed the RIDGE heading and the ridge feature list
# (duplicating the ridge summary already printed earlier).
print("For Lasso Regression (Doubled alpha model: alpha:0.001*2 = 0.002): \n", "*" * 125)
print("The most important top10 predictor variables after the change is implemented are as follows:\n\n",
      list(top10_lasso_df['Features']), "\n", "*" * 125)
For Ridge Regression (Doubled alpha model, alpha=8*2=16): ***************************************************************************************************************************** The most important top10 predictor variables after the change is implemented are as follows: ['GrLivArea', 'AgeofProperty', 'OverallQual', 'MSZoning_FV', 'MSSubClass_160', 'MSZoning_RL', 'Neighborhood_Crawfor', 'OverallCond', 'MSSubClass_70', 'TotalBsmtSF'] *****************************************************************************************************************************
## NOTE(review): this cell duplicates the doubled-alpha lasso fit earlier in the
## notebook -- consider removing one copy.
## Derive the value from the stored optimum instead of hard-coding 0.002
lasso = Lasso(alpha=optimal_alpha_lasso * 2)
lasso.fit(X_train, y_train)
print("Intercept: ", lasso.intercept_)
print("Coefficients:\n ", lasso.coef_)
Intercept: -0.14421116610261947 Coefficients: [ 0.24515523 0.15904187 0.10325384 0.14258939 0.36425581 0.10472228 -0.29690021 0. 0.17230256 -0. -0.15993685 -0.26990325 0.06157464 0.28056602 0.18252235 -0. 0.21391105 -0.00119804 -0.04132858 -0.03617834 -0.07573251 -0.04201532 -0. -0.05123676 0.12337558 -0.04024964 -0.02299667 0.13158657 0. -0. 0. -0. -0.03821582 -0. 0.00132769 -0.01656393 0. 0. 0.01854981 0.02838593 -0. -0. 0. -0. 0. -0. 0.12763467 0. -0.00616419 -0.09043783]
## Doubled-alpha Lasso model: predictions for the train and test sets
y_pred_train_l = lasso.predict(X_train)
y_pred_test_l = lasso.predict(X_test)
## Evaluate the fit on both sets:
## R2 (goodness of fit)
r2_score_lasso_train = r2_score(y_train, y_pred_train_l)
r2_score_lasso_test = r2_score(y_test, y_pred_test_l)
## Mean Squared Error
MSE_lasso_train = mean_squared_error(y_true=y_train, y_pred=y_pred_train_l)
MSE_lasso_test = mean_squared_error(y_true=y_test, y_pred=y_pred_test_l)
## Mean Absolute Error
MAE_lasso_train = mean_absolute_error(y_true=y_train, y_pred=y_pred_train_l)
MAE_lasso_test = mean_absolute_error(y_true=y_test, y_pred=y_pred_test_l)
## Root Mean Squared Error (square root of the MSE)
RMSE_lasso_train = np.sqrt(MSE_lasso_train)
RMSE_lasso_test = np.sqrt(MSE_lasso_test)
## Report every metric for both sets
print("For Lasso Regression Model: (Doubled alpha model: alpha:0.001*2 = 0.002)\n", "*" * 40)
print("\nFor Train Set:\nR2 score:", r2_score_lasso_train, "\nMSE score:", MSE_lasso_train,
      "\nMAE score:", MAE_lasso_train, "\nRMSE score:", RMSE_lasso_train)
print("\nFor Test Set:\nR2 score:", r2_score_lasso_test, "\nMSE score:", MSE_lasso_test,
      "\nMAE score:", MAE_lasso_test, "\nRMSE score:", RMSE_lasso_test, "\n", "*" * 40)
For Lasso Regression Model: (Doubled alpha model: alpha:0.001*2 = 0.002) **************************************** For Train Set: R2 score: 0.9103228278462835 MSE score: 0.08967717215371657 MAE score: 0.21401154178442824 RMSE score: 0.2994614702323432 For Test Set: R2 score: 0.8920884694594363 MSE score: 0.10461312031053645 MAE score: 0.21611414123017147 RMSE score: 0.3234395156911667 ****************************************
## Feature/coefficient table for the doubled-alpha lasso model
vals = lasso.coef_.round(4)
lasso_df = pd.DataFrame({'Features': X_train.columns,
                         'Coefficient': vals,
                         'Abs_Coefficient_Lasso(Desc_Sort)': abs(vals)})
## Order by magnitude of the coefficient, biggest first, then reset the index
lasso_df = (lasso_df
            .sort_values(by='Abs_Coefficient_Lasso(Desc_Sort)', ascending=False)
            .reset_index(drop=True))
lasso_df.head(10)  # display the ten strongest features
| Features | Coefficient | Abs_Coefficient_Lasso(Desc_Sort) | |
|---|---|---|---|
| 0 | GrLivArea | 0.3643 | 0.3643 |
| 1 | AgeofProperty | -0.2969 | 0.2969 |
| 2 | MSZoning_FV | 0.2806 | 0.2806 |
| 3 | MSSubClass_160 | -0.2699 | 0.2699 |
| 4 | OverallQual | 0.2452 | 0.2452 |
| 5 | Neighborhood_Crawfor | 0.2139 | 0.2139 |
| 6 | MSZoning_RL | 0.1825 | 0.1825 |
| 7 | MSSubClass_70 | 0.1723 | 0.1723 |
| 8 | MSSubClass_90 | -0.1599 | 0.1599 |
| 9 | OverallCond | 0.1590 | 0.1590 |
## Bar plot of the top 10 lasso coefficients (doubled-alpha model)
top10_lasso_df = lasso_df.loc[:9]  # first 10 rows (lasso_df is already sorted by |coef|)
sns.set(style='white')
plt.figure(figsize=(16, 8), dpi=120)
ax4 = sns.barplot(y=top10_lasso_df['Features'], x=top10_lasso_df['Coefficient'], palette='Set1')
plt.xlabel('Coefficient Values', fontsize=14, fontstyle='italic')
plt.ylabel('Features', fontsize=14, fontstyle='italic')
# Fixed typo in the chart title: 'Coefficents' -> 'Coefficients'
plt.title('Coefficients of Top 10 Features (Lasso Regression):[Doubled alpha model, alpha=0.001*2=0.002]', fontsize=18, fontweight='bold')
c = top10_lasso_df['Coefficient']  # coefficient values used to annotate the bars
for index, value in enumerate(c):
    plt.text(value, index, str(value), fontsize=13)  # label each bar with its value
plt.grid(True)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.autoscale()
plt.tight_layout()
plt.show()
## Summary: the ten most important lasso predictors after doubling alpha
sep = "*" * 125
print("For Lasso Regression (Doubled alpha model: alpha:0.001*2 = 0.002): \n", sep)
print("The most important top10 predictor variables after the change is implemented are as follows:\n\n",
      list(top10_lasso_df['Features']), "\n", sep)
For Lasso Regression (Doubled alpha model: alpha:0.001*2 = 0.002): ***************************************************************************************************************************** The most important top10 predictor variables after the change is implemented are as follows: ['GrLivArea', 'AgeofProperty', 'MSZoning_FV', 'MSSubClass_160', 'OverallQual', 'Neighborhood_Crawfor', 'MSZoning_RL', 'MSSubClass_70', 'MSSubClass_90', 'OverallCond'] *****************************************************************************************************************************